Spaces:
Running
Running
Delanoe Pirard
commited on
Commit
·
114747f
1
Parent(s):
cc3d383
Lot's of changes
Browse files- agents/advanced_validation_agent.py +8 -8
- agents/annexe_autres_elements.tex +0 -0
- agents/code_agent.py +35 -33
- agents/figure_interpretation_agent.py +0 -298
- agents/image_analyzer_agent.py +1 -1
- agents/long_context_management_agent.py +6 -5
- agents/math_agent.py +2 -1
- agents/planner_agent.py +12 -14
- agents/reasoning_agent.py +18 -7
- agents/research_agent.py +596 -61
- agents/role_agent.py +2 -1
- agents/synthesis_agent.py +153 -0
- agents/text_analyzer_agent.py +3 -120
- agents/verifier_agent.py +0 -296
- agents/video_analyzer_agent.py +109 -31
- app.py +95 -37
- cookies.txt +16 -0
- gaia_improvement_plan.md +0 -943
- get_cookie.py +10 -1
- packages.txt +1 -0
- prompts/code_gen_prompt.txt +1 -2
- prompts/image_analyzer_prompt.txt +38 -38
- prompts/planner_agent_prompt.txt +9 -4
- prompts/reasoning_agent_prompt.txt +3 -3
- prompts/text_analyzer_prompt.txt +29 -29
- pyproject.toml +3 -0
- requirements.txt +2 -0
- todo.md +0 -44
- uv.lock +60 -0
agents/advanced_validation_agent.py
CHANGED
@@ -45,7 +45,7 @@ def cross_reference_check(claim: str, sources_content: List[Dict[str, str]]) ->
|
|
45 |
return {"error": "No source content provided for cross-referencing."}
|
46 |
|
47 |
# LLM configuration
|
48 |
-
llm_model = os.getenv("VALIDATION_LLM_MODEL", "
|
49 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
50 |
if not gemini_api_key:
|
51 |
logger.error("GEMINI_API_KEY not found for cross-referencing LLM.")
|
@@ -53,7 +53,7 @@ def cross_reference_check(claim: str, sources_content: List[Dict[str, str]]) ->
|
|
53 |
|
54 |
results = []
|
55 |
try:
|
56 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
57 |
logger.info(f"Using cross-referencing LLM: {llm_model}")
|
58 |
|
59 |
for i, source in enumerate(sources_content):
|
@@ -114,7 +114,7 @@ def logical_consistency_check(text: str) -> Dict[str, Union[bool, str, List[str]
|
|
114 |
logger.info(f"Checking logical consistency for text (length: {len(text)} chars).")
|
115 |
|
116 |
# LLM configuration
|
117 |
-
llm_model = os.getenv("VALIDATION_LLM_MODEL", "
|
118 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
119 |
if not gemini_api_key:
|
120 |
logger.error("GEMINI_API_KEY not found for consistency check LLM.")
|
@@ -138,7 +138,7 @@ def logical_consistency_check(text: str) -> Dict[str, Union[bool, str, List[str]
|
|
138 |
)
|
139 |
|
140 |
try:
|
141 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
142 |
logger.info(f"Using consistency check LLM: {llm_model}")
|
143 |
response = llm.complete(prompt)
|
144 |
|
@@ -174,7 +174,7 @@ def bias_detection(text: str, source_context: Optional[str] = None) -> Dict[str,
|
|
174 |
logger.info(f"Detecting bias in text (length: {len(text)} chars). Context provided: {source_context is not None}")
|
175 |
|
176 |
# LLM configuration
|
177 |
-
llm_model = os.getenv("VALIDATION_LLM_MODEL", "
|
178 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
179 |
if not gemini_api_key:
|
180 |
logger.error("GEMINI_API_KEY not found for bias detection LLM.")
|
@@ -203,7 +203,7 @@ def bias_detection(text: str, source_context: Optional[str] = None) -> Dict[str,
|
|
203 |
)
|
204 |
|
205 |
try:
|
206 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
207 |
logger.info(f"Using bias detection LLM: {llm_model}")
|
208 |
response = llm.complete(prompt)
|
209 |
|
@@ -300,7 +300,7 @@ def initialize_advanced_validation_agent() -> ReActAgent:
|
|
300 |
logger.info("Initializing AdvancedValidationAgent...")
|
301 |
|
302 |
# Configuration for the agent's main LLM
|
303 |
-
agent_llm_model = os.getenv("VALIDATION_AGENT_LLM_MODEL", "
|
304 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
305 |
|
306 |
if not gemini_api_key:
|
@@ -308,7 +308,7 @@ def initialize_advanced_validation_agent() -> ReActAgent:
|
|
308 |
raise ValueError("GEMINI_API_KEY must be set for AdvancedValidationAgent")
|
309 |
|
310 |
try:
|
311 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
312 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
313 |
|
314 |
# Load system prompt
|
|
|
45 |
return {"error": "No source content provided for cross-referencing."}
|
46 |
|
47 |
# LLM configuration
|
48 |
+
llm_model = os.getenv("VALIDATION_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Use a capable model
|
49 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
50 |
if not gemini_api_key:
|
51 |
logger.error("GEMINI_API_KEY not found for cross-referencing LLM.")
|
|
|
53 |
|
54 |
results = []
|
55 |
try:
|
56 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
57 |
logger.info(f"Using cross-referencing LLM: {llm_model}")
|
58 |
|
59 |
for i, source in enumerate(sources_content):
|
|
|
114 |
logger.info(f"Checking logical consistency for text (length: {len(text)} chars).")
|
115 |
|
116 |
# LLM configuration
|
117 |
+
llm_model = os.getenv("VALIDATION_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
118 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
119 |
if not gemini_api_key:
|
120 |
logger.error("GEMINI_API_KEY not found for consistency check LLM.")
|
|
|
138 |
)
|
139 |
|
140 |
try:
|
141 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
142 |
logger.info(f"Using consistency check LLM: {llm_model}")
|
143 |
response = llm.complete(prompt)
|
144 |
|
|
|
174 |
logger.info(f"Detecting bias in text (length: {len(text)} chars). Context provided: {source_context is not None}")
|
175 |
|
176 |
# LLM configuration
|
177 |
+
llm_model = os.getenv("VALIDATION_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
178 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
179 |
if not gemini_api_key:
|
180 |
logger.error("GEMINI_API_KEY not found for bias detection LLM.")
|
|
|
203 |
)
|
204 |
|
205 |
try:
|
206 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
207 |
logger.info(f"Using bias detection LLM: {llm_model}")
|
208 |
response = llm.complete(prompt)
|
209 |
|
|
|
300 |
logger.info("Initializing AdvancedValidationAgent...")
|
301 |
|
302 |
# Configuration for the agent's main LLM
|
303 |
+
agent_llm_model = os.getenv("VALIDATION_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Use Pro for main agent logic
|
304 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
305 |
|
306 |
if not gemini_api_key:
|
|
|
308 |
raise ValueError("GEMINI_API_KEY must be set for AdvancedValidationAgent")
|
309 |
|
310 |
try:
|
311 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
312 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
313 |
|
314 |
# Load system prompt
|
agents/annexe_autres_elements.tex
ADDED
File without changes
|
agents/code_agent.py
CHANGED
@@ -64,7 +64,7 @@ def generate_python_code(prompt: str) -> str:
|
|
64 |
model=gen_llm_model,
|
65 |
api_key=gen_api_key,
|
66 |
reasoning_effort="high",
|
67 |
-
temperature=0.
|
68 |
max_tokens=16384
|
69 |
)
|
70 |
logger.info(f"Using code generation LLM: {gen_llm_model}")
|
@@ -116,7 +116,7 @@ def initialize_code_agent() -> ReActAgent:
|
|
116 |
logger.info("Initializing CodeAgent...")
|
117 |
|
118 |
# Configuration for the agent's main LLM
|
119 |
-
agent_llm_model = os.getenv("CODE_AGENT_LLM_MODEL", "
|
120 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
121 |
|
122 |
if not gemini_api_key:
|
@@ -127,6 +127,7 @@ def initialize_code_agent() -> ReActAgent:
|
|
127 |
llm = GoogleGenAI(
|
128 |
api_key=gemini_api_key,
|
129 |
model=agent_llm_model,
|
|
|
130 |
)
|
131 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
132 |
|
@@ -147,37 +148,38 @@ def initialize_code_agent() -> ReActAgent:
|
|
147 |
agent = ReActAgent(
|
148 |
name="code_agent",
|
149 |
description=(
|
150 |
-
"Generates Python code using `python_code_generator` and executes it safely
|
151 |
-
"
|
152 |
-
"The agent
|
153 |
-
"- beautifulsoup4>=4.13.4\n"
|
154 |
-
"- certifi>=2025.4.26\n"
|
155 |
-
"- datasets>=3.5.1\n"
|
156 |
-
"- dotenv>=0.9.9\n"
|
157 |
-
"- duckdb>=1.2.2\n"
|
158 |
-
"- ffmpeg-python>=0.2.0\n"
|
159 |
-
"- gradio[oauth]>=5.28.0\n"
|
160 |
-
"- helium>=5.1.1\n"
|
161 |
-
"- huggingface>=0.0.1\n"
|
162 |
-
"- imageio>=2.37.0
|
163 |
-
"- matplotlib>=3.10.1\n"
|
164 |
-
"- numpy>=2.2.5\n"
|
165 |
-
"- openai-whisper>=20240930\n"
|
166 |
-
"- opencv-python>=4.11.0.86\n"
|
167 |
-
"- openpyxl>=3.1.5\n"
|
168 |
-
"- pandas>=2.2.3\n"
|
169 |
-
"- pyarrow>=20.0.0\n"
|
170 |
-
"- pygame>=2.6.1\n"
|
171 |
-
"- python-chess>=1.999\n"
|
172 |
-
"- requests>=2.32.3\n"
|
173 |
-
"- scikit-learn>=1.6.1\n"
|
174 |
-
"- scipy>=1.15.2\n"
|
175 |
-
"- seaborn>=0.13.2\n"
|
176 |
-
"- sqlalchemy>=2.0.40\n"
|
177 |
-
"- statsmodels>=0.14.4\n"
|
178 |
-
"-
|
179 |
-
"-
|
180 |
-
"-
|
|
|
181 |
),
|
182 |
# REMOVED: code_execute_fn - Execution is handled by the code_interpreter tool via the agent loop.
|
183 |
tools=[
|
|
|
64 |
model=gen_llm_model,
|
65 |
api_key=gen_api_key,
|
66 |
reasoning_effort="high",
|
67 |
+
temperature=0.05,
|
68 |
max_tokens=16384
|
69 |
)
|
70 |
logger.info(f"Using code generation LLM: {gen_llm_model}")
|
|
|
116 |
logger.info("Initializing CodeAgent...")
|
117 |
|
118 |
# Configuration for the agent's main LLM
|
119 |
+
agent_llm_model = os.getenv("CODE_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
120 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
121 |
|
122 |
if not gemini_api_key:
|
|
|
127 |
llm = GoogleGenAI(
|
128 |
api_key=gemini_api_key,
|
129 |
model=agent_llm_model,
|
130 |
+
temperature=0.05
|
131 |
)
|
132 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
133 |
|
|
|
148 |
agent = ReActAgent(
|
149 |
name="code_agent",
|
150 |
description=(
|
151 |
+
"Generates Python code using `python_code_generator` and executes it safely with "
|
152 |
+
"`code_interpreter`, then iteratively debugs and refines the code from run-time feedback.\n\n"
|
153 |
+
"The agent can leverage the following pre-installed packages:\n"
|
154 |
+
"- beautifulsoup4>=4.13.4 : HTML/XML parsing and lightweight web scraping\n"
|
155 |
+
"- certifi>=2025.4.26 : Mozilla CA bundle for secure TLS/SSL requests\n"
|
156 |
+
"- datasets>=3.5.1 : Hugging Face dataset loading and streaming\n"
|
157 |
+
"- dotenv>=0.9.9 : Load environment variables from .env files\n"
|
158 |
+
"- duckdb>=1.2.2 : In‑process OLAP SQL engine (analytics, Parquet, Arrow)\n"
|
159 |
+
"- ffmpeg-python>=0.2.0 : Wrapper around FFmpeg for audio/video operations\n"
|
160 |
+
"- gradio[oauth]>=5.28.0 : Rapid web‑UI prototyping with optional OAuth\n"
|
161 |
+
"- helium>=5.1.1 : High‑level Selenium / browser automation toolkit\n"
|
162 |
+
"- huggingface>=0.0.1 : Interact with Hugging Face Hub models, datasets, spaces\n"
|
163 |
+
"- imageio>=2.37.0 : Read and write images, GIFs, MP4s, volumes, etc.\n"
|
164 |
+
"- matplotlib>=3.10.1 : 2‑D plotting (figures, axes, annotations)\n"
|
165 |
+
"- numpy>=2.2.5 : N‑dimensional arrays and vectorized math\n"
|
166 |
+
"- openai-whisper>=20240930 : Speech‑to‑text transcription\n"
|
167 |
+
"- opencv-python>=4.11.0.86 : Computer vision, image/video processing\n"
|
168 |
+
"- openpyxl>=3.1.5 : Excel .xlsx read/write, styles, formulas\n"
|
169 |
+
"- pandas>=2.2.3 : DataFrames, time series, CSV/Parquet I/O\n"
|
170 |
+
"- pyarrow>=20.0.0 : Apache Arrow tables, Parquet, Flight RPC\n"
|
171 |
+
"- pygame>=2.6.1 : Simple 2‑D game/graphics engine (SDL based)\n"
|
172 |
+
"- python-chess>=1.999 : Chess move generation, PGN/FEN handling, engines\n"
|
173 |
+
"- requests>=2.32.3 : HTTP/HTTPS client with sessions and retries\n"
|
174 |
+
"- scikit-learn>=1.6.1 : Machine‑learning algorithms, preprocessing, pipelines\n"
|
175 |
+
"- scipy>=1.15.2 : Scientific computing, optimization, signal processing\n"
|
176 |
+
"- seaborn>=0.13.2 : Statistical visualization on top of matplotlib\n"
|
177 |
+
"- sqlalchemy>=2.0.40 : SQL ORM and core engine for many databases\n"
|
178 |
+
"- statsmodels>=0.14.4 : Econometrics and statistical modeling (GLM, ARIMA)\n"
|
179 |
+
"- stockfish==3.28.0 : UCI interface to Stockfish chess engine\n"
|
180 |
+
"- sympy>=1.14.0 : Symbolic math, algebra, calculus CAS\n"
|
181 |
+
"- youtube-transcript-api>=1.0.3 : Fetch YouTube video transcripts via API\n"
|
182 |
+
"- yt-dlp>=2025.3.31 : Download videos/playlists from YouTube and other sites\n"
|
183 |
),
|
184 |
# REMOVED: code_execute_fn - Execution is handled by the code_interpreter tool via the agent loop.
|
185 |
tools=[
|
agents/figure_interpretation_agent.py
DELETED
@@ -1,298 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import logging
|
3 |
-
|
4 |
-
from llama_index.core.agent.workflow import ReActAgent
|
5 |
-
from llama_index.core.schema import ImageDocument
|
6 |
-
from llama_index.core.tools import FunctionTool
|
7 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
8 |
-
|
9 |
-
# Setup logging
|
10 |
-
logger = logging.getLogger(__name__)
|
11 |
-
|
12 |
-
# Helper function to load prompt from file
|
13 |
-
def load_prompt_from_file(filename: str, default_prompt: str) -> str:
|
14 |
-
"""Loads a prompt from a text file."""
|
15 |
-
try:
|
16 |
-
script_dir = os.path.dirname(__file__)
|
17 |
-
prompt_path = os.path.join(script_dir, filename)
|
18 |
-
with open(prompt_path, "r") as f:
|
19 |
-
prompt = f.read()
|
20 |
-
logger.info(f"Successfully loaded prompt from {prompt_path}")
|
21 |
-
return prompt
|
22 |
-
except FileNotFoundError:
|
23 |
-
logger.warning(f"Prompt file {filename} not found at {prompt_path}. Using default.")
|
24 |
-
return default_prompt
|
25 |
-
except Exception as e:
|
26 |
-
logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
|
27 |
-
return default_prompt
|
28 |
-
|
29 |
-
# --- Core Figure Interpretation Logic (using Multi-Modal LLM) ---
|
30 |
-
|
31 |
-
def interpret_figure_with_llm(image_path: str, request: str) -> str:
|
32 |
-
"""Interprets a figure in an image based on a specific request using a multi-modal LLM.
|
33 |
-
Args:
|
34 |
-
image_path (str): Path to the image file containing the figure.
|
35 |
-
request (str): The specific question or interpretation task (e.g., "Describe this chart",
|
36 |
-
"Extract sales for Q3", "Identify the main trend").
|
37 |
-
Returns:
|
38 |
-
str: The interpretation result or an error message.
|
39 |
-
"""
|
40 |
-
logger.info(f"Interpreting figure in image: {image_path} with request: {request}")
|
41 |
-
|
42 |
-
# Check if image exists
|
43 |
-
if not os.path.exists(image_path):
|
44 |
-
logger.error(f"Image file not found: {image_path}")
|
45 |
-
return f"Error: Image file not found at {image_path}"
|
46 |
-
|
47 |
-
# LLM configuration (Must be a multi-modal model)
|
48 |
-
# Ensure the selected model supports image input (e.g., gemini-1.5-pro)
|
49 |
-
llm_model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
|
50 |
-
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
51 |
-
if not gemini_api_key:
|
52 |
-
logger.error("GEMINI_API_KEY not found for figure interpretation LLM.")
|
53 |
-
return "Error: GEMINI_API_KEY not set."
|
54 |
-
|
55 |
-
try:
|
56 |
-
# Initialize the multi-modal LLM
|
57 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=llm_model_name)
|
58 |
-
logger.info(f"Using figure interpretation LLM: {llm_model_name}")
|
59 |
-
|
60 |
-
# Prepare the prompt for the multi-modal LLM
|
61 |
-
# The prompt needs to guide the LLM to act as the figure interpreter
|
62 |
-
# based on the specific request.
|
63 |
-
prompt = (
|
64 |
-
f"You are an expert figure interpreter. Analyze the provided image containing a chart, graph, diagram, or table. "
|
65 |
-
f"Focus *only* on the visual information present in the image. "
|
66 |
-
f"Fulfill the following request accurately and concisely:\n\n"
|
67 |
-
f"REQUEST: {request}\n\n"
|
68 |
-
f"Based *only* on the image, provide the answer:"
|
69 |
-
)
|
70 |
-
|
71 |
-
# Load the image data (LlamaIndex integration might handle this differently depending on version)
|
72 |
-
# Assuming a method to load image data compatible with the LLM call
|
73 |
-
# This might involve using ImageBlock or similar structures in newer LlamaIndex versions.
|
74 |
-
# For simplicity here, we assume the LLM call can handle a path or loaded image object.
|
75 |
-
|
76 |
-
# Example using complete (adjust based on actual LlamaIndex multi-modal API)
|
77 |
-
# Note: The exact API for multi-modal completion might vary.
|
78 |
-
# This is a conceptual example.
|
79 |
-
from llama_index.core import SimpleDirectoryReader # Example import
|
80 |
-
|
81 |
-
# Load the image document
|
82 |
-
reader = SimpleDirectoryReader(input_files=[image_path])
|
83 |
-
image_documents = reader.load_data()
|
84 |
-
|
85 |
-
if not image_documents or not isinstance(image_documents[0], ImageDocument):
|
86 |
-
logger.error(f"Failed to load image as ImageDocument: {image_path}")
|
87 |
-
return f"Error: Could not load image file {image_path} for analysis."
|
88 |
-
|
89 |
-
# Make the multi-modal completion call
|
90 |
-
response = llm.complete(
|
91 |
-
prompt=prompt,
|
92 |
-
image_documents=image_documents # Pass the loaded image document(s)
|
93 |
-
)
|
94 |
-
|
95 |
-
interpretation = response.text.strip()
|
96 |
-
logger.info("Figure interpretation successful.")
|
97 |
-
return interpretation
|
98 |
-
|
99 |
-
except FileNotFoundError:
|
100 |
-
# This might be redundant due to the initial check, but good practice
|
101 |
-
logger.error(f"Image file not found during LLM call: {image_path}")
|
102 |
-
return f"Error: Image file not found at {image_path}"
|
103 |
-
except ImportError as ie:
|
104 |
-
logger.error(f"Missing library for multi-modal processing: {ie}")
|
105 |
-
return f"Error: Missing required library for image processing ({ie})."
|
106 |
-
except Exception as e:
|
107 |
-
# Catch potential API errors or other issues
|
108 |
-
logger.error(f"LLM call failed during figure interpretation: {e}", exc_info=True)
|
109 |
-
# Check if the error suggests the model doesn't support images
|
110 |
-
if "does not support image input" in str(e).lower():
|
111 |
-
logger.error(f"The configured model {llm_model_name} does not support image input.")
|
112 |
-
return f"Error: The configured LLM ({llm_model_name}) does not support image input. Please configure a multi-modal model."
|
113 |
-
return f"Error during figure interpretation: {e}"
|
114 |
-
|
115 |
-
# --- Tool Definitions (Wrapping the core logic) ---
|
116 |
-
# These tools essentially pass the request to the core LLM function.
|
117 |
-
|
118 |
-
def describe_figure_tool_fn(image_path: str) -> str:
|
119 |
-
"Provides a general description of the figure in the image (type, elements, topic)."
|
120 |
-
return interpret_figure_with_llm(image_path, "Describe this figure, including its type, main elements (axes, labels, legend), and overall topic.")
|
121 |
-
|
122 |
-
def extract_data_points_tool_fn(image_path: str, data_request: str) -> str:
|
123 |
-
"Extracts specific data points or values from the figure in the image."
|
124 |
-
return interpret_figure_with_llm(image_path, f"Extract the following data points/values from the figure: {data_request}. If exact values are not clear, provide the closest estimate based on the visual.")
|
125 |
-
|
126 |
-
def identify_trends_tool_fn(image_path: str) -> str:
|
127 |
-
"Identifies and describes trends or patterns shown in the figure in the image."
|
128 |
-
return interpret_figure_with_llm(image_path, "Analyze and describe the main trends or patterns shown in this figure.")
|
129 |
-
|
130 |
-
def compare_elements_tool_fn(image_path: str, comparison_request: str) -> str:
|
131 |
-
"Compares different elements within the figure in the image."
|
132 |
-
return interpret_figure_with_llm(image_path, f"Compare the following elements within the figure: {comparison_request}. Be specific about the comparison based on the visual data.")
|
133 |
-
|
134 |
-
def summarize_figure_insights_tool_fn(image_path: str) -> str:
|
135 |
-
"Summarizes the key insights or main message conveyed by the figure in the image."
|
136 |
-
return interpret_figure_with_llm(image_path, "Summarize the key insights or the main message conveyed by this figure.")
|
137 |
-
|
138 |
-
# --- Tool Definitions for Agent ---
|
139 |
-
describe_figure_tool = FunctionTool.from_defaults(
|
140 |
-
fn=describe_figure_tool_fn,
|
141 |
-
name="describe_figure",
|
142 |
-
description="Provides a general description of the figure in the image (type, elements, topic). Input: image_path (str)."
|
143 |
-
)
|
144 |
-
|
145 |
-
extract_data_points_tool = FunctionTool.from_defaults(
|
146 |
-
fn=extract_data_points_tool_fn,
|
147 |
-
name="extract_data_points",
|
148 |
-
description="Extracts specific data points/values from the figure. Input: image_path (str), data_request (str)."
|
149 |
-
)
|
150 |
-
|
151 |
-
identify_trends_tool = FunctionTool.from_defaults(
|
152 |
-
fn=identify_trends_tool_fn,
|
153 |
-
name="identify_trends",
|
154 |
-
description="Identifies and describes trends/patterns in the figure. Input: image_path (str)."
|
155 |
-
)
|
156 |
-
|
157 |
-
compare_elements_tool = FunctionTool.from_defaults(
|
158 |
-
fn=compare_elements_tool_fn,
|
159 |
-
name="compare_elements",
|
160 |
-
description="Compares different elements within the figure. Input: image_path (str), comparison_request (str)."
|
161 |
-
)
|
162 |
-
|
163 |
-
summarize_figure_insights_tool = FunctionTool.from_defaults(
|
164 |
-
fn=summarize_figure_insights_tool_fn,
|
165 |
-
name="summarize_figure_insights",
|
166 |
-
description="Summarizes the key insights/main message of the figure. Input: image_path (str)."
|
167 |
-
)
|
168 |
-
|
169 |
-
# --- Agent Initialization ---
|
170 |
-
def initialize_figure_interpretation_agent() -> ReActAgent:
|
171 |
-
"""Initializes the Figure Interpretation Agent."""
|
172 |
-
logger.info("Initializing FigureInterpretationAgent...")
|
173 |
-
|
174 |
-
# Configuration for the agent's main LLM (can be the same multi-modal one)
|
175 |
-
agent_llm_model = os.getenv("FIGURE_INTERPRETATION_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
176 |
-
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
177 |
-
|
178 |
-
if not gemini_api_key:
|
179 |
-
logger.error("GEMINI_API_KEY not found for FigureInterpretationAgent.")
|
180 |
-
raise ValueError("GEMINI_API_KEY must be set for FigureInterpretationAgent")
|
181 |
-
|
182 |
-
try:
|
183 |
-
# Agent's LLM doesn't necessarily need to be multi-modal itself,
|
184 |
-
# if the tools handle the multi-modal calls.
|
185 |
-
# However, using a multi-modal one might allow more direct interaction patterns later.
|
186 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=agent_llm_model)
|
187 |
-
logger.info(f"Using agent LLM: {agent_llm_model}")
|
188 |
-
|
189 |
-
# Load system prompt
|
190 |
-
default_system_prompt = ("You are FigureInterpretationAgent... [Default prompt content - replace with actual]" # Placeholder
|
191 |
-
)
|
192 |
-
system_prompt = load_prompt_from_file("../prompts/figure_interpretation_agent_prompt.txt", default_system_prompt)
|
193 |
-
if system_prompt == default_system_prompt:
|
194 |
-
logger.warning("Using default/fallback system prompt for FigureInterpretationAgent.")
|
195 |
-
|
196 |
-
# Define available tools
|
197 |
-
tools = [
|
198 |
-
describe_figure_tool,
|
199 |
-
extract_data_points_tool,
|
200 |
-
identify_trends_tool,
|
201 |
-
compare_elements_tool,
|
202 |
-
summarize_figure_insights_tool
|
203 |
-
]
|
204 |
-
|
205 |
-
# Define valid handoff targets
|
206 |
-
valid_handoffs = [
|
207 |
-
"planner_agent", # To return results
|
208 |
-
"research_agent", # If context from figure needs further research
|
209 |
-
"reasoning_agent" # If interpretation needs logical analysis
|
210 |
-
]
|
211 |
-
|
212 |
-
agent = ReActAgent(
|
213 |
-
name="figure_interpretation_agent",
|
214 |
-
description=(
|
215 |
-
"Analyzes and interprets visual data representations (charts, graphs, tables) from image files. "
|
216 |
-
"Can describe figures, extract data, identify trends, compare elements, and summarize insights."
|
217 |
-
),
|
218 |
-
tools=tools,
|
219 |
-
llm=llm,
|
220 |
-
system_prompt=system_prompt,
|
221 |
-
can_handoff_to=valid_handoffs,
|
222 |
-
# Note: This agent inherently requires multi-modal input capabilities,
|
223 |
-
# which are handled within its tools via a multi-modal LLM.
|
224 |
-
)
|
225 |
-
logger.info("FigureInterpretationAgent initialized successfully.")
|
226 |
-
return agent
|
227 |
-
|
228 |
-
except Exception as e:
|
229 |
-
logger.error(f"Error during FigureInterpretationAgent initialization: {e}", exc_info=True)
|
230 |
-
raise
|
231 |
-
|
232 |
-
# Example usage (for testing if run directly)
|
233 |
-
if __name__ == "__main__":
|
234 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
235 |
-
logger.info("Running figure_interpretation_agent.py directly for testing...")
|
236 |
-
|
237 |
-
# Check required keys
|
238 |
-
required_keys = ["GEMINI_API_KEY"]
|
239 |
-
missing_keys = [key for key in required_keys if not os.getenv(key)]
|
240 |
-
if missing_keys:
|
241 |
-
print(f"Error: Required environment variable(s) not set: {', '.join(missing_keys)}. Cannot run test.")
|
242 |
-
else:
|
243 |
-
# Check if a multi-modal model is likely configured (heuristic)
|
244 |
-
model_name = os.getenv("FIGURE_INTERPRETATION_LLM_MODEL", "models/gemini-1.5-pro")
|
245 |
-
if "pro" not in model_name.lower() and "vision" not in model_name.lower():
|
246 |
-
print(f"Warning: Configured LLM {model_name} might not support image input. Tests may fail.")
|
247 |
-
|
248 |
-
# Create a dummy image file for testing (requires Pillow)
|
249 |
-
dummy_image_path = "dummy_figure.png"
|
250 |
-
try:
|
251 |
-
from PIL import Image, ImageDraw, ImageFont
|
252 |
-
img = Image.new('RGB', (400, 200), color = (255, 255, 255))
|
253 |
-
d = ImageDraw.Draw(img)
|
254 |
-
# Try to load a default font, handle if not found
|
255 |
-
try:
|
256 |
-
font = ImageFont.truetype("arial.ttf", 15) # Common font, might not exist
|
257 |
-
except IOError:
|
258 |
-
font = ImageFont.load_default()
|
259 |
-
print("Arial font not found, using default PIL font.")
|
260 |
-
d.text((10,10), "Simple Bar Chart", fill=(0,0,0), font=font)
|
261 |
-
d.rectangle([50, 50, 100, 150], fill=(255,0,0)) # Bar 1
|
262 |
-
d.text((60, 160), "A", fill=(0,0,0), font=font)
|
263 |
-
d.rectangle([150, 80, 200, 150], fill=(0,0,255)) # Bar 2
|
264 |
-
d.text((160, 160), "B", fill=(0,0,0), font=font)
|
265 |
-
img.save(dummy_image_path)
|
266 |
-
print(f"Created dummy image file: {dummy_image_path}")
|
267 |
-
|
268 |
-
# Test the tools directly
|
269 |
-
print("\nTesting describe_figure...")
|
270 |
-
desc = describe_figure_tool_fn(dummy_image_path)
|
271 |
-
print(f"Description: {desc}")
|
272 |
-
|
273 |
-
print("\nTesting extract_data_points (qualitative)...")
|
274 |
-
extract_req = "Height of bar A vs Bar B" # Qualitative request
|
275 |
-
extract_res = extract_data_points_tool_fn(dummy_image_path, extract_req)
|
276 |
-
print(f"Extraction Result: {extract_res}")
|
277 |
-
|
278 |
-
print("\nTesting compare_elements...")
|
279 |
-
compare_req = "Compare bar A and bar B"
|
280 |
-
compare_res = compare_elements_tool_fn(dummy_image_path, compare_req)
|
281 |
-
print(f"Comparison Result: {compare_res}")
|
282 |
-
|
283 |
-
# Clean up dummy image
|
284 |
-
os.remove(dummy_image_path)
|
285 |
-
|
286 |
-
except ImportError:
|
287 |
-
print("Pillow library not installed. Skipping direct tool tests that require image creation.")
|
288 |
-
# Optionally, still try initializing the agent
|
289 |
-
try:
|
290 |
-
test_agent = initialize_figure_interpretation_agent()
|
291 |
-
print("\nFigure Interpretation Agent initialized successfully (tool tests skipped).")
|
292 |
-
except Exception as e:
|
293 |
-
print(f"Error initializing agent: {e}")
|
294 |
-
except Exception as e:
|
295 |
-
print(f"Error during testing: {e}")
|
296 |
-
if os.path.exists(dummy_image_path):
|
297 |
-
os.remove(dummy_image_path) # Ensure cleanup on error
|
298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agents/image_analyzer_agent.py
CHANGED
@@ -35,7 +35,7 @@ def initialize_image_analyzer_agent() -> FunctionAgent:
|
|
35 |
logger.info("Initializing ImageAnalyzerAgent...")
|
36 |
|
37 |
# Configuration from environment variables
|
38 |
-
llm_model_name = os.getenv("IMAGE_ANALYZER_LLM_MODEL", "
|
39 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
40 |
|
41 |
if not gemini_api_key:
|
|
|
35 |
logger.info("Initializing ImageAnalyzerAgent...")
|
36 |
|
37 |
# Configuration from environment variables
|
38 |
+
llm_model_name = os.getenv("IMAGE_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
39 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
40 |
|
41 |
if not gemini_api_key:
|
agents/long_context_management_agent.py
CHANGED
@@ -115,7 +115,7 @@ def summarize_long_context(detail_level: Literal["brief", "standard", "detailed"
|
|
115 |
min_length = min_length or int(max_length * 0.3) # Default min length
|
116 |
|
117 |
# LLM configuration
|
118 |
-
llm_model = os.getenv("CONTEXT_LLM_MODEL", "
|
119 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
120 |
if not gemini_api_key:
|
121 |
logger.error("GEMINI_API_KEY not found for summarization LLM.")
|
@@ -135,7 +135,7 @@ def summarize_long_context(detail_level: Literal["brief", "standard", "detailed"
|
|
135 |
)
|
136 |
|
137 |
try:
|
138 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
139 |
logger.info(f"Using summarization LLM: {llm_model}")
|
140 |
response = llm.complete(prompt)
|
141 |
summary = response.text.strip()
|
@@ -307,7 +307,7 @@ def initialize_long_context_management_agent() -> ReActAgent:
|
|
307 |
logger.info("Initializing LongContextManagementAgent...")
|
308 |
|
309 |
# Configuration for the agent's main LLM
|
310 |
-
agent_llm_model = os.getenv("CONTEXT_AGENT_LLM_MODEL", "
|
311 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
312 |
|
313 |
if not gemini_api_key:
|
@@ -315,7 +315,7 @@ def initialize_long_context_management_agent() -> ReActAgent:
|
|
315 |
raise ValueError("GEMINI_API_KEY must be set for LongContextManagementAgent")
|
316 |
|
317 |
try:
|
318 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
319 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
320 |
Settings.llm = llm # Set default LLM for LlamaIndex components used by tools
|
321 |
|
@@ -339,7 +339,8 @@ def initialize_long_context_management_agent() -> ReActAgent:
|
|
339 |
valid_handoffs = [
|
340 |
"planner_agent", # To return results
|
341 |
"text_analyzer_agent", # If further analysis of extracted/filtered text is needed
|
342 |
-
"reasoning_agent"
|
|
|
343 |
]
|
344 |
|
345 |
agent = ReActAgent(
|
|
|
115 |
min_length = min_length or int(max_length * 0.3) # Default min length
|
116 |
|
117 |
# LLM configuration
|
118 |
+
llm_model = os.getenv("CONTEXT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Use Pro for potentially long context
|
119 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
120 |
if not gemini_api_key:
|
121 |
logger.error("GEMINI_API_KEY not found for summarization LLM.")
|
|
|
135 |
)
|
136 |
|
137 |
try:
|
138 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
139 |
logger.info(f"Using summarization LLM: {llm_model}")
|
140 |
response = llm.complete(prompt)
|
141 |
summary = response.text.strip()
|
|
|
307 |
logger.info("Initializing LongContextManagementAgent...")
|
308 |
|
309 |
# Configuration for the agent's main LLM
|
310 |
+
agent_llm_model = os.getenv("CONTEXT_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Needs to handle planning
|
311 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
312 |
|
313 |
if not gemini_api_key:
|
|
|
315 |
raise ValueError("GEMINI_API_KEY must be set for LongContextManagementAgent")
|
316 |
|
317 |
try:
|
318 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
319 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
320 |
Settings.llm = llm # Set default LLM for LlamaIndex components used by tools
|
321 |
|
|
|
339 |
valid_handoffs = [
|
340 |
"planner_agent", # To return results
|
341 |
"text_analyzer_agent", # If further analysis of extracted/filtered text is needed
|
342 |
+
"reasoning_agent",
|
343 |
+
"research_agent"
|
344 |
]
|
345 |
|
346 |
agent = ReActAgent(
|
agents/math_agent.py
CHANGED
@@ -627,7 +627,7 @@ def initialize_math_agent() -> ReActAgent:
|
|
627 |
logger.info("Initializing MathAgent...")
|
628 |
|
629 |
# Configuration
|
630 |
-
agent_llm_model = os.getenv("MATH_AGENT_LLM_MODEL", "
|
631 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
632 |
|
633 |
if not gemini_api_key:
|
@@ -638,6 +638,7 @@ def initialize_math_agent() -> ReActAgent:
|
|
638 |
llm = GoogleGenAI(
|
639 |
api_key=gemini_api_key,
|
640 |
model=agent_llm_model,
|
|
|
641 |
)
|
642 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
643 |
|
|
|
627 |
logger.info("Initializing MathAgent...")
|
628 |
|
629 |
# Configuration
|
630 |
+
agent_llm_model = os.getenv("MATH_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
631 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
632 |
|
633 |
if not gemini_api_key:
|
|
|
638 |
llm = GoogleGenAI(
|
639 |
api_key=gemini_api_key,
|
640 |
model=agent_llm_model,
|
641 |
+
temperature=0.05
|
642 |
)
|
643 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
644 |
|
agents/planner_agent.py
CHANGED
@@ -41,7 +41,7 @@ def plan(objective: str) -> List[str]:
|
|
41 |
logger.info(f"Generating plan for objective: {objective[:100]}...")
|
42 |
|
43 |
# Configuration for planning LLM
|
44 |
-
planner_llm_model = os.getenv("PLANNER_TOOL_LLM_MODEL", "
|
45 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
46 |
if not gemini_api_key:
|
47 |
logger.error("GEMINI_API_KEY not found for planning tool LLM.")
|
@@ -57,7 +57,7 @@ def plan(objective: str) -> List[str]:
|
|
57 |
)
|
58 |
|
59 |
try:
|
60 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
61 |
logger.info(f"Using planning LLM: {planner_llm_model}")
|
62 |
response = llm.complete(input_prompt)
|
63 |
|
@@ -112,7 +112,7 @@ def synthesize_and_report(results: List[Dict[str, str]]) -> str:
|
|
112 |
summary_blocks += f"Sub-step {i+1}: {sub_step}\nAnswer {i+1}: {answer}\n\n"
|
113 |
|
114 |
# Configuration for synthesis LLM
|
115 |
-
synthesizer_llm_model = os.getenv("SYNTHESIZER_LLM_MODEL", "
|
116 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
117 |
if not gemini_api_key:
|
118 |
logger.error("GEMINI_API_KEY not found for synthesis tool LLM.")
|
@@ -131,7 +131,7 @@ def synthesize_and_report(results: List[Dict[str, str]]) -> str:
|
|
131 |
"""
|
132 |
|
133 |
try:
|
134 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
135 |
logger.info(f"Using synthesis LLM: {synthesizer_llm_model}")
|
136 |
response = llm.complete(input_prompt)
|
137 |
logger.info("Synthesis successful.")
|
@@ -163,7 +163,7 @@ def answer_question(question: str) -> str:
|
|
163 |
logger.error("GEMINI_API_KEY not set for answer_question tool.")
|
164 |
return "Error: GEMINI_API_KEY not set."
|
165 |
|
166 |
-
model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "
|
167 |
|
168 |
# Build the assistant prompt enforcing the required format
|
169 |
assistant_prompt = (
|
@@ -180,7 +180,7 @@ def answer_question(question: str) -> str:
|
|
180 |
)
|
181 |
|
182 |
try:
|
183 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
184 |
logger.info(f"Using answer LLM: {model_name}")
|
185 |
response = llm.complete(assistant_prompt)
|
186 |
logger.info("Answer generated successfully.")
|
@@ -213,8 +213,8 @@ answer_question = FunctionTool.from_defaults(
|
|
213 |
fn=answer_question,
|
214 |
name="answer_question",
|
215 |
description=(
|
216 |
-
"
|
217 |
-
"
|
218 |
),
|
219 |
)
|
220 |
|
@@ -224,7 +224,7 @@ def initialize_planner_agent() -> ReActAgent:
|
|
224 |
logger.info("Initializing PlannerAgent...")
|
225 |
|
226 |
# Configuration for the agent's main LLM
|
227 |
-
agent_llm_model = os.getenv("PLANNER_AGENT_LLM_MODEL", "
|
228 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
229 |
|
230 |
if not gemini_api_key:
|
@@ -232,7 +232,7 @@ def initialize_planner_agent() -> ReActAgent:
|
|
232 |
raise ValueError("GEMINI_API_KEY must be set for PlannerAgent")
|
233 |
|
234 |
try:
|
235 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
236 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
237 |
|
238 |
# Load system prompt
|
@@ -243,7 +243,7 @@ def initialize_planner_agent() -> ReActAgent:
|
|
243 |
logger.warning("Using default/fallback system prompt for PlannerAgent.")
|
244 |
|
245 |
# Define available tools
|
246 |
-
tools = [generate_substeps_tool, synthesize_tool
|
247 |
|
248 |
# Define valid handoff targets
|
249 |
valid_handoffs = [
|
@@ -253,9 +253,7 @@ def initialize_planner_agent() -> ReActAgent:
|
|
253 |
"role_agent",
|
254 |
"image_analyzer_agent",
|
255 |
"text_analyzer_agent",
|
256 |
-
"verifier_agent",
|
257 |
"reasoning_agent",
|
258 |
-
"figure_interpretation_agent",
|
259 |
"long_context_management_agent",
|
260 |
"advanced_validation_agent",
|
261 |
"video_analyzer_agent"
|
@@ -303,7 +301,7 @@ if __name__ == "__main__":
|
|
303 |
{"sub_step": "Find recent sales data.", "answer": "EV sales grew 25% year-over-year in Q1 2024."},
|
304 |
{"sub_step": "Analyze government incentives.", "answer": "Germany reduced subsidies, France maintained them."}
|
305 |
]
|
306 |
-
report =
|
307 |
print(f"Synthesized Report:\n{report}")
|
308 |
|
309 |
# Initialize the agent (optional)
|
|
|
41 |
logger.info(f"Generating plan for objective: {objective[:100]}...")
|
42 |
|
43 |
# Configuration for planning LLM
|
44 |
+
planner_llm_model = os.getenv("PLANNER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Specific model for this tool?
|
45 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
46 |
if not gemini_api_key:
|
47 |
logger.error("GEMINI_API_KEY not found for planning tool LLM.")
|
|
|
57 |
)
|
58 |
|
59 |
try:
|
60 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
61 |
logger.info(f"Using planning LLM: {planner_llm_model}")
|
62 |
response = llm.complete(input_prompt)
|
63 |
|
|
|
112 |
summary_blocks += f"Sub-step {i+1}: {sub_step}\nAnswer {i+1}: {answer}\n\n"
|
113 |
|
114 |
# Configuration for synthesis LLM
|
115 |
+
synthesizer_llm_model = os.getenv("SYNTHESIZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25") # Specific model?
|
116 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
117 |
if not gemini_api_key:
|
118 |
logger.error("GEMINI_API_KEY not found for synthesis tool LLM.")
|
|
|
131 |
"""
|
132 |
|
133 |
try:
|
134 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
135 |
logger.info(f"Using synthesis LLM: {synthesizer_llm_model}")
|
136 |
response = llm.complete(input_prompt)
|
137 |
logger.info("Synthesis successful.")
|
|
|
163 |
logger.error("GEMINI_API_KEY not set for answer_question tool.")
|
164 |
return "Error: GEMINI_API_KEY not set."
|
165 |
|
166 |
+
model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
167 |
|
168 |
# Build the assistant prompt enforcing the required format
|
169 |
assistant_prompt = (
|
|
|
180 |
)
|
181 |
|
182 |
try:
|
183 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
184 |
logger.info(f"Using answer LLM: {model_name}")
|
185 |
response = llm.complete(assistant_prompt)
|
186 |
logger.info("Answer generated successfully.")
|
|
|
213 |
fn=answer_question,
|
214 |
name="answer_question",
|
215 |
description=(
|
216 |
+
"Answers any question and returns the full text, always ending with "
|
217 |
+
"‘FINAL ANSWER: ...’ in accordance with the formatting rules."
|
218 |
),
|
219 |
)
|
220 |
|
|
|
224 |
logger.info("Initializing PlannerAgent...")
|
225 |
|
226 |
# Configuration for the agent's main LLM
|
227 |
+
agent_llm_model = os.getenv("PLANNER_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
228 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
229 |
|
230 |
if not gemini_api_key:
|
|
|
232 |
raise ValueError("GEMINI_API_KEY must be set for PlannerAgent")
|
233 |
|
234 |
try:
|
235 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
236 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
237 |
|
238 |
# Load system prompt
|
|
|
243 |
logger.warning("Using default/fallback system prompt for PlannerAgent.")
|
244 |
|
245 |
# Define available tools
|
246 |
+
tools = [generate_substeps_tool, synthesize_tool]
|
247 |
|
248 |
# Define valid handoff targets
|
249 |
valid_handoffs = [
|
|
|
253 |
"role_agent",
|
254 |
"image_analyzer_agent",
|
255 |
"text_analyzer_agent",
|
|
|
256 |
"reasoning_agent",
|
|
|
257 |
"long_context_management_agent",
|
258 |
"advanced_validation_agent",
|
259 |
"video_analyzer_agent"
|
|
|
301 |
{"sub_step": "Find recent sales data.", "answer": "EV sales grew 25% year-over-year in Q1 2024."},
|
302 |
{"sub_step": "Analyze government incentives.", "answer": "Germany reduced subsidies, France maintained them."}
|
303 |
]
|
304 |
+
report = synthesize_and_report(test_results)
|
305 |
print(f"Synthesized Report:\n{report}")
|
306 |
|
307 |
# Initialize the agent (optional)
|
agents/reasoning_agent.py
CHANGED
@@ -72,7 +72,7 @@ def reasoning_tool_fn(context: str) -> str:
|
|
72 |
model=reasoning_llm_model,
|
73 |
api_key=openai_api_key,
|
74 |
reasoning_effort="high",
|
75 |
-
temperature=0.
|
76 |
max_tokens=16384
|
77 |
)
|
78 |
logger.info(f"Using reasoning LLM: {reasoning_llm_model}")
|
@@ -107,7 +107,7 @@ def answer_question(question: str) -> str:
|
|
107 |
logger.error("GEMINI_API_KEY not set for answer_question tool.")
|
108 |
return "Error: GEMINI_API_KEY not set."
|
109 |
|
110 |
-
model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "
|
111 |
|
112 |
# Build the assistant prompt enforcing the required format
|
113 |
assistant_prompt = (
|
@@ -124,7 +124,7 @@ def answer_question(question: str) -> str:
|
|
124 |
)
|
125 |
|
126 |
try:
|
127 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
128 |
logger.info(f"Using answer LLM: {model_name}")
|
129 |
response = llm.complete(assistant_prompt)
|
130 |
logger.info("Answer generated successfully.")
|
@@ -159,7 +159,7 @@ def initialize_reasoning_agent() -> ReActAgent:
|
|
159 |
logger.info("Initializing ReasoningAgent...")
|
160 |
|
161 |
# Configuration for the agent's main LLM (Google GenAI)
|
162 |
-
agent_llm_model = os.getenv("REASONING_AGENT_LLM_MODEL", "
|
163 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
164 |
|
165 |
if not gemini_api_key:
|
@@ -167,7 +167,7 @@ def initialize_reasoning_agent() -> ReActAgent:
|
|
167 |
raise ValueError("GEMINI_API_KEY must be set for ReasoningAgent")
|
168 |
|
169 |
try:
|
170 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
171 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
172 |
|
173 |
# Load system prompt
|
@@ -185,10 +185,21 @@ def initialize_reasoning_agent() -> ReActAgent:
|
|
185 |
"then seamlessly delegates the synthesized insights to `planner_agent` "
|
186 |
"or `long_context_management_agent` for subsequent task orchestration."
|
187 |
),
|
188 |
-
tools=[reasoning_tool
|
189 |
llm=llm,
|
190 |
system_prompt=system_prompt,
|
191 |
-
can_handoff_to=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
192 |
)
|
193 |
|
194 |
return agent
|
|
|
72 |
model=reasoning_llm_model,
|
73 |
api_key=openai_api_key,
|
74 |
reasoning_effort="high",
|
75 |
+
temperature=0.055,
|
76 |
max_tokens=16384
|
77 |
)
|
78 |
logger.info(f"Using reasoning LLM: {reasoning_llm_model}")
|
|
|
107 |
logger.error("GEMINI_API_KEY not set for answer_question tool.")
|
108 |
return "Error: GEMINI_API_KEY not set."
|
109 |
|
110 |
+
model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
111 |
|
112 |
# Build the assistant prompt enforcing the required format
|
113 |
assistant_prompt = (
|
|
|
124 |
)
|
125 |
|
126 |
try:
|
127 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
128 |
logger.info(f"Using answer LLM: {model_name}")
|
129 |
response = llm.complete(assistant_prompt)
|
130 |
logger.info("Answer generated successfully.")
|
|
|
159 |
logger.info("Initializing ReasoningAgent...")
|
160 |
|
161 |
# Configuration for the agent's main LLM (Google GenAI)
|
162 |
+
agent_llm_model = os.getenv("REASONING_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
163 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
164 |
|
165 |
if not gemini_api_key:
|
|
|
167 |
raise ValueError("GEMINI_API_KEY must be set for ReasoningAgent")
|
168 |
|
169 |
try:
|
170 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
171 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
172 |
|
173 |
# Load system prompt
|
|
|
185 |
"then seamlessly delegates the synthesized insights to `planner_agent` "
|
186 |
"or `long_context_management_agent` for subsequent task orchestration."
|
187 |
),
|
188 |
+
tools=[reasoning_tool],
|
189 |
llm=llm,
|
190 |
system_prompt=system_prompt,
|
191 |
+
can_handoff_to=[
|
192 |
+
"code_agent",
|
193 |
+
"research_agent",
|
194 |
+
"math_agent",
|
195 |
+
"role_agent",
|
196 |
+
"image_analyzer_agent",
|
197 |
+
"text_analyzer_agent",
|
198 |
+
"planner_agent",
|
199 |
+
"long_context_management_agent",
|
200 |
+
"advanced_validation_agent",
|
201 |
+
"video_analyzer_agent"
|
202 |
+
],
|
203 |
)
|
204 |
|
205 |
return agent
|
agents/research_agent.py
CHANGED
@@ -2,10 +2,12 @@ import os
|
|
2 |
import time
|
3 |
import logging
|
4 |
import re # Import regex for video ID extraction
|
5 |
-
from typing import List, Optional, Dict # Added Dict
|
6 |
|
|
|
7 |
from llama_index.core.agent.workflow import ReActAgent
|
8 |
from llama_index.core.tools import FunctionTool
|
|
|
9 |
from llama_index.llms.google_genai import GoogleGenAI
|
10 |
from llama_index.tools.google import GoogleSearchToolSpec
|
11 |
from llama_index.tools.tavily_research import TavilyToolSpec
|
@@ -67,7 +69,7 @@ def browser_tool_handler(func):
|
|
67 |
return wrapper
|
68 |
|
69 |
@browser_tool_handler
|
70 |
-
def
|
71 |
"""Navigate the browser to the specified URL and wait for the page to load."""
|
72 |
logger.info(f"Navigating to {url} and waiting {wait_seconds}s...")
|
73 |
go_to(url)
|
@@ -76,9 +78,35 @@ def visit(url: str, wait_seconds: float = 3.0) -> str:
|
|
76 |
return f"Successfully navigated to: {current_url}"
|
77 |
|
78 |
@browser_tool_handler
|
79 |
-
def
|
80 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
81 |
logger.info(f"Extracting text using CSS selector: {selector}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
if selector.lower() == "body":
|
83 |
# Helium Text() might be too broad, let's try body tag first
|
84 |
try:
|
@@ -94,19 +122,253 @@ def get_text_by_css(selector: str) -> List[str]:
|
|
94 |
# Process Helium elements if fallback is used
|
95 |
texts = [elem.web_element.text for elem in elements if elem.web_element.is_displayed() and elem.web_element.text.strip()]
|
96 |
logger.info(f"Extracted {len(texts)} visible text elements using Helium Text().")
|
|
|
|
|
|
|
97 |
return texts
|
98 |
else:
|
99 |
# Use Selenium directly for more control
|
100 |
elements_selenium = _browser_driver.find_elements(By.CSS_SELECTOR, selector)
|
101 |
texts = [elem.text for elem in elements_selenium if elem.is_displayed() and elem.text.strip()]
|
102 |
logger.info(f"Extracted {len(texts)} visible text elements for selector {selector}.")
|
|
|
|
|
103 |
return texts
|
104 |
|
105 |
@browser_tool_handler
|
106 |
-
def
|
107 |
-
|
108 |
-
|
109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
110 |
|
111 |
@browser_tool_handler
|
112 |
def click_element_by_css(selector: str, index: int = 0) -> str:
|
@@ -135,7 +397,7 @@ def click_element_by_css(selector: str, index: int = 0) -> str:
|
|
135 |
return f"Clicked element {index} matching selector {selector}. Current URL: {_browser_driver.current_url}"
|
136 |
|
137 |
@browser_tool_handler
|
138 |
-
def input_text_by_css(selector: str, text: str, index: int = 0, press_enter: bool =
|
139 |
"""Input text into the Nth (0-based index) element matching the CSS selector. Optionally press Enter."""
|
140 |
logger.info(f"Attempting to input text into element {index} matching selector: {selector}")
|
141 |
# Use Selenium directly for finding elements
|
@@ -205,7 +467,7 @@ def close_popups() -> str:
|
|
205 |
time.sleep(0.5)
|
206 |
return "Sent ESC key press."
|
207 |
|
208 |
-
def answer_question(question: str) -> str:
|
209 |
"""
|
210 |
Answer any question by following this strict format:
|
211 |
1. Include your chain of thought (your reasoning steps).
|
@@ -223,16 +485,78 @@ def answer_question(question: str) -> str:
|
|
223 |
"""
|
224 |
logger.info(f"Answering question: {question[:100]}")
|
225 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
226 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
227 |
if not gemini_api_key:
|
228 |
logger.error("GEMINI_API_KEY not set for answer_question tool.")
|
229 |
return "Error: GEMINI_API_KEY not set."
|
230 |
|
231 |
-
model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
232 |
|
233 |
# Build the assistant prompt enforcing the required format
|
234 |
assistant_prompt = (
|
235 |
-
"
|
|
|
236 |
"Report your thoughts, and finish your answer with the following template: "
|
237 |
"FINAL ANSWER: [YOUR FINAL ANSWER]. "
|
238 |
"YOUR FINAL ANSWER should be a number OR as few words as possible "
|
@@ -240,12 +564,14 @@ def answer_question(question: str) -> str:
|
|
240 |
"If you are asked for a number, don't use commas for thousands or any units like $ or % unless specified. "
|
241 |
"If you are asked for a string, omit articles and abbreviations, and write digits in plain text. "
|
242 |
"If you are asked for a comma separated list, apply these rules to each element.\n\n"
|
|
|
|
|
243 |
f"Question: {question}\n"
|
244 |
"Answer:"
|
245 |
)
|
246 |
|
247 |
try:
|
248 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
249 |
logger.info(f"Using answer LLM: {model_name}")
|
250 |
response = llm.complete(assistant_prompt)
|
251 |
logger.info("Answer generated successfully.")
|
@@ -282,21 +608,39 @@ class ResearchAgentInitializer:
|
|
282 |
fn=answer_question,
|
283 |
name="answer_question",
|
284 |
description=(
|
285 |
-
"
|
286 |
-
"
|
287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
)
|
289 |
|
290 |
logger.info("ResearchAgent resources initialized.")
|
291 |
|
292 |
def _initialize_llm(self):
|
293 |
-
agent_llm_model = os.getenv("RESEARCH_AGENT_LLM_MODEL", "
|
294 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
295 |
if not gemini_api_key:
|
296 |
logger.error("GEMINI_API_KEY not found for ResearchAgent LLM.")
|
297 |
raise ValueError("GEMINI_API_KEY must be set for ResearchAgent")
|
298 |
try:
|
299 |
-
self.llm = GoogleGenAI(api_key=gemini_api_key, model=
|
300 |
logger.info(f"ResearchAgent LLM initialized: {agent_llm_model}")
|
301 |
except Exception as e:
|
302 |
logger.error(f"Failed to initialize ResearchAgent LLM: {e}", exc_info=True)
|
@@ -336,19 +680,138 @@ class ResearchAgentInitializer:
|
|
336 |
if not SELENIUM_AVAILABLE:
|
337 |
self.browser_tools = []
|
338 |
return
|
339 |
-
|
340 |
self.browser_tools = [
|
341 |
-
FunctionTool.from_defaults(
|
342 |
-
|
343 |
-
|
344 |
-
|
345 |
-
|
346 |
-
|
347 |
-
|
348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
349 |
]
|
350 |
-
|
351 |
-
tool.metadata.description = f"(Browser) {tool.metadata.description}"
|
352 |
logger.info(f"Created {len(self.browser_tools)} browser interaction tools.")
|
353 |
|
354 |
def _create_search_tools(self):
|
@@ -357,8 +820,10 @@ class ResearchAgentInitializer:
|
|
357 |
# Google Search
|
358 |
google_spec = GoogleSearchToolSpec(key=os.getenv("GOOGLE_API_KEY"), engine=os.getenv("GOOGLE_CSE_ID"))
|
359 |
if google_spec:
|
360 |
-
google_tool = FunctionTool.from_defaults(
|
361 |
-
|
|
|
|
|
362 |
self.search_tools.append(google_tool)
|
363 |
|
364 |
# Tavily Search
|
@@ -390,6 +855,62 @@ class ResearchAgentInitializer:
|
|
390 |
wiki_load_tool.metadata.description = "(Wikipedia) Load the full content of a specific Wikipedia page title."
|
391 |
self.datasource_tools.extend([wiki_search_tool, wiki_load_tool])
|
392 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
393 |
# Yahoo Finance
|
394 |
yf_spec = YahooFinanceToolSpec()
|
395 |
if yf_spec:
|
@@ -425,52 +946,66 @@ class ResearchAgentInitializer:
|
|
425 |
logger.info("Creating ResearchAgent ReActAgent instance...")
|
426 |
|
427 |
all_tools = self.browser_tools + self.search_tools + self.datasource_tools
|
428 |
-
all_tools.append(self.answer_question)
|
429 |
|
430 |
if not all_tools:
|
431 |
logger.warning("No tools available for ResearchAgent. It will likely be unable to function.")
|
432 |
|
433 |
# System prompt (consider loading from file)
|
434 |
# Updated prompt to include YouTube tool
|
435 |
-
system_prompt = """
|
436 |
-
You are ResearchAgent, an autonomous web
|
437 |
|
438 |
-
Available Tool Categories
|
439 |
-
- (Browser): Tools for direct
|
440 |
- (Search): Tools for querying search engines (Google, DuckDuckGo, Tavily).
|
441 |
- (Wikipedia): Tools for searching and loading Wikipedia pages.
|
442 |
- (YahooFinance): Tools for retrieving financial data (balance sheets, income statements, stock info, news).
|
443 |
- (ArXiv): Tool for searching academic papers on ArXiv.
|
444 |
-
- (
|
445 |
-
|
446 |
-
|
447 |
-
|
|
|
|
|
448 |
|
|
|
|
|
449 |
FINAL ANSWER: [YOUR FINAL ANSWER]
|
450 |
|
451 |
-
Formatting rules for
|
452 |
- A single number, or
|
453 |
- As few words as possible, or
|
454 |
-
- A comma
|
455 |
-
|
456 |
-
|
457 |
-
|
458 |
-
|
459 |
-
|
460 |
-
1.
|
461 |
-
2.
|
462 |
-
3.
|
463 |
-
4.
|
464 |
-
5.
|
465 |
-
6.
|
466 |
-
7.
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
-
|
471 |
-
-
|
472 |
-
-
|
473 |
-
- Do not skip any
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
474 |
"""
|
475 |
|
476 |
agent = ReActAgent(
|
|
|
2 |
import time
|
3 |
import logging
|
4 |
import re # Import regex for video ID extraction
|
5 |
+
from typing import List, Optional, Dict, Any # Added Dict
|
6 |
|
7 |
+
from duckdb.duckdb import description
|
8 |
from llama_index.core.agent.workflow import ReActAgent
|
9 |
from llama_index.core.tools import FunctionTool
|
10 |
+
from llama_index.core.workflow import Context
|
11 |
from llama_index.llms.google_genai import GoogleGenAI
|
12 |
from llama_index.tools.google import GoogleSearchToolSpec
|
13 |
from llama_index.tools.tavily_research import TavilyToolSpec
|
|
|
69 |
return wrapper
|
70 |
|
71 |
@browser_tool_handler
|
72 |
+
def visit_url(url: str, wait_seconds: float = 3.0) -> str:
|
73 |
"""Navigate the browser to the specified URL and wait for the page to load."""
|
74 |
logger.info(f"Navigating to {url} and waiting {wait_seconds}s...")
|
75 |
go_to(url)
|
|
|
78 |
return f"Successfully navigated to: {current_url}"
|
79 |
|
80 |
@browser_tool_handler
|
81 |
+
def get_text_by_css_selector(selector: str) -> list[Any] | str:
|
82 |
+
"""
|
83 |
+
(Browser) Extract visible text content from a webpage using a CSS selector.
|
84 |
+
|
85 |
+
Args:
|
86 |
+
selector (str):
|
87 |
+
A valid CSS selector (e.g., 'body', '.content', '#main').
|
88 |
+
|
89 |
+
Behavior:
|
90 |
+
- If selector == 'body', extracts all visible text from the <body> tag.
|
91 |
+
- If the <body> tag is not found, falls back to Helium Text() for visible elements.
|
92 |
+
- For any other selector, uses Selenium to find all matching elements.
|
93 |
+
- Filters out invisible elements and empty lines.
|
94 |
+
|
95 |
+
Returns:
|
96 |
+
list[str]:
|
97 |
+
A list of visible text lines.
|
98 |
+
OR
|
99 |
+
str:
|
100 |
+
An error message starting with "Error:" on failure (e.g., missing state).
|
101 |
+
"""
|
102 |
logger.info(f"Extracting text using CSS selector: {selector}")
|
103 |
+
# state_dict = await ctx.get("state")
|
104 |
+
# if not state_dict:
|
105 |
+
# logger.error("State not found in context.")
|
106 |
+
# return "Error: State not found."
|
107 |
+
#
|
108 |
+
# research_content = state_dict.get("research_content", [])
|
109 |
+
|
110 |
if selector.lower() == "body":
|
111 |
# Helium Text() might be too broad, let's try body tag first
|
112 |
try:
|
|
|
122 |
# Process Helium elements if fallback is used
|
123 |
texts = [elem.web_element.text for elem in elements if elem.web_element.is_displayed() and elem.web_element.text.strip()]
|
124 |
logger.info(f"Extracted {len(texts)} visible text elements using Helium Text().")
|
125 |
+
# research_content.extend(texts)
|
126 |
+
# state_dict["research_content"] = research_content
|
127 |
+
# await ctx.set("state", state_dict)
|
128 |
return texts
|
129 |
else:
|
130 |
# Use Selenium directly for more control
|
131 |
elements_selenium = _browser_driver.find_elements(By.CSS_SELECTOR, selector)
|
132 |
texts = [elem.text for elem in elements_selenium if elem.is_displayed() and elem.text.strip()]
|
133 |
logger.info(f"Extracted {len(texts)} visible text elements for selector {selector}.")
|
134 |
+
# state_dict["research_content"] = research_content
|
135 |
+
# await ctx.set("state", state_dict)
|
136 |
return texts
|
137 |
|
138 |
@browser_tool_handler
|
139 |
+
def search_in_page(query: str,
|
140 |
+
case_sensitive: bool = False,
|
141 |
+
max_results: int = 50) -> list[str] | str:
|
142 |
+
"""
|
143 |
+
(Browser) Search for occurrences of a word or phrase in the visible text of the current page.
|
144 |
+
|
145 |
+
Args:
|
146 |
+
query (str):
|
147 |
+
Word or phrase to search for (e.g., 'machine learning').
|
148 |
+
case_sensitive (bool, optional):
|
149 |
+
Whether the search should be case-sensitive (default: False).
|
150 |
+
max_results (int, optional):
|
151 |
+
Maximum number of matching lines to return (default: 50).
|
152 |
+
|
153 |
+
Behavior:
|
154 |
+
- Retrieves all visible text from the <body> tag.
|
155 |
+
- Splits the text into individual lines.
|
156 |
+
- Filters lines that contain the `query` (respecting `case_sensitive`).
|
157 |
+
- Appends the matching lines to `state['research_content']`.
|
158 |
+
- Truncates the result to `max_results`.
|
159 |
+
|
160 |
+
Returns:
|
161 |
+
list[str]:
|
162 |
+
List of matching lines (up to `max_results`).
|
163 |
+
OR
|
164 |
+
str:
|
165 |
+
An error message starting with "Error:" on failure (e.g., missing state or browser).
|
166 |
+
"""
|
167 |
+
# Ensure we have state
|
168 |
+
# state = await ctx.get("state") or {}
|
169 |
+
# if not state:
|
170 |
+
# logger.error("State not found in context.")
|
171 |
+
# return "Error: State not found."
|
172 |
+
|
173 |
+
# Extract all visible text from the page
|
174 |
+
try:
|
175 |
+
body = _browser_driver.find_element(By.TAG_NAME, "body")
|
176 |
+
text = body.text or ""
|
177 |
+
except Exception as e:
|
178 |
+
logger.error(f"Failed to extract page text: {e}")
|
179 |
+
return f"Error: Could not retrieve page text ({e})."
|
180 |
+
|
181 |
+
# Prepare for search
|
182 |
+
lines = [line.strip() for line in text.splitlines() if line.strip()]
|
183 |
+
needle = query if case_sensitive else query.lower()
|
184 |
+
|
185 |
+
# Find matches
|
186 |
+
matches = []
|
187 |
+
for line in lines:
|
188 |
+
haystack = line if case_sensitive else line.lower()
|
189 |
+
if needle in haystack:
|
190 |
+
matches.append(line)
|
191 |
+
if len(matches) >= max_results:
|
192 |
+
break
|
193 |
+
|
194 |
+
# Update research context
|
195 |
+
# research = state.get("research_content", [])
|
196 |
+
# research.extend(matches)
|
197 |
+
# state["research_content"] = research
|
198 |
+
# await ctx.set("state", state)
|
199 |
+
|
200 |
+
return matches
|
201 |
+
|
202 |
+
@browser_tool_handler
|
203 |
+
def suggest_informative_selectors(min_words: int = 10, max_selectors: int = 30) -> List[str]:
|
204 |
+
"""
|
205 |
+
Analyze the current page and return a list of CSS selectors likely to contain informative text,
|
206 |
+
along with up to 1000 characters of the element's visible content.
|
207 |
+
|
208 |
+
Parameters:
|
209 |
+
- min_words (int): minimum number of words in an element's text to consider it informative.
|
210 |
+
- max_selectors (int): maximum number of distinct selectors to return.
|
211 |
+
|
212 |
+
Returns:
|
213 |
+
- List[str]: each entry formatted as "selector: preview", where preview is a truncated (1000 chars max) version of the element's content.
|
214 |
+
"""
|
215 |
+
logger.info("Analyzing page to suggest informative CSS selectors with previews...")
|
216 |
+
elements = _browser_driver.find_elements(By.XPATH, "//*[not(self::script or self::style or self::head)]")
|
217 |
+
selector_scores: Dict[str, Dict] = {}
|
218 |
+
|
219 |
+
for elem in elements:
|
220 |
+
if not elem.is_displayed():
|
221 |
+
continue
|
222 |
+
try:
|
223 |
+
text = elem.text.strip()
|
224 |
+
if len(text.split()) >= min_words:
|
225 |
+
tag = elem.tag_name
|
226 |
+
class_attr = elem.get_attribute("class") or ""
|
227 |
+
id_attr = elem.get_attribute("id") or ""
|
228 |
+
|
229 |
+
# Prioritize by specificity: id > class > tag
|
230 |
+
if id_attr:
|
231 |
+
selector = f"{tag}#{id_attr}"
|
232 |
+
elif class_attr:
|
233 |
+
main_class = class_attr.strip().split()[0]
|
234 |
+
selector = f"{tag}.{main_class}"
|
235 |
+
else:
|
236 |
+
selector = tag
|
237 |
+
|
238 |
+
current_score = len(text)
|
239 |
+
if selector not in selector_scores or current_score > selector_scores[selector]["score"]:
|
240 |
+
selector_scores[selector] = {
|
241 |
+
"score": current_score,
|
242 |
+
"preview": text[:1000] # Limit preview to 1000 chars
|
243 |
+
}
|
244 |
+
except Exception as e:
|
245 |
+
logger.warning(f"Error processing element: {e}")
|
246 |
+
continue
|
247 |
+
|
248 |
+
# Sort by score (proxy for information density) and return top N
|
249 |
+
sorted_items = sorted(selector_scores.items(), key=lambda x: x[1]["score"], reverse=True)
|
250 |
+
top_descriptions = [f"{selector}: {info['preview']}" for selector, info in sorted_items[:max_selectors]]
|
251 |
+
|
252 |
+
logger.info(f"Suggested {len(top_descriptions)} informative selectors with previews.")
|
253 |
+
return top_descriptions
|
254 |
+
|
255 |
+
@browser_tool_handler
|
256 |
+
def inspect_clickable_elements(max_elements: int = 20) -> List[str]:
|
257 |
+
"""
|
258 |
+
Inspect the current page and return a list of visible, clickable elements with their CSS selectors and preview text.
|
259 |
+
|
260 |
+
Parameters:
|
261 |
+
- max_elements (int): maximum number of elements to include.
|
262 |
+
|
263 |
+
Returns:
|
264 |
+
- List[str]: descriptions of clickable elements with selector, tag, and truncated inner text.
|
265 |
+
"""
|
266 |
+
logger.info("Inspecting page for clickable elements...")
|
267 |
+
|
268 |
+
# Define XPaths for clickable elements
|
269 |
+
xpaths = [
|
270 |
+
"//a[@href]",
|
271 |
+
"//button",
|
272 |
+
"//input[@type='submit' or @type='button']",
|
273 |
+
"//*[@onclick]",
|
274 |
+
"//*[contains(@role, 'button')]"
|
275 |
+
]
|
276 |
+
seen = set()
|
277 |
+
results = []
|
278 |
+
|
279 |
+
for xpath in xpaths:
|
280 |
+
try:
|
281 |
+
elements = _browser_driver.find_elements(By.XPATH, xpath)
|
282 |
+
for elem in elements:
|
283 |
+
if not elem.is_displayed():
|
284 |
+
continue
|
285 |
+
|
286 |
+
try:
|
287 |
+
tag = elem.tag_name
|
288 |
+
class_attr = elem.get_attribute("class") or ""
|
289 |
+
id_attr = elem.get_attribute("id") or ""
|
290 |
+
text = elem.text.strip()
|
291 |
+
|
292 |
+
# Construct CSS selector
|
293 |
+
if id_attr:
|
294 |
+
selector = f"{tag}#{id_attr}"
|
295 |
+
elif class_attr:
|
296 |
+
selector = f"{tag}.{class_attr.strip().split()[0]}"
|
297 |
+
else:
|
298 |
+
selector = tag
|
299 |
+
|
300 |
+
if selector in seen:
|
301 |
+
continue
|
302 |
+
seen.add(selector)
|
303 |
+
|
304 |
+
description = (
|
305 |
+
f"selector: {selector}\n"
|
306 |
+
f"tag: {tag}\n"
|
307 |
+
f"text: {text[:100] if text else '[no visible text]'}"
|
308 |
+
)
|
309 |
+
results.append(description)
|
310 |
+
|
311 |
+
if len(results) >= max_elements:
|
312 |
+
logger.info(f"Reached limit of {max_elements} clickable elements.")
|
313 |
+
return results
|
314 |
+
except Exception as inner_err:
|
315 |
+
logger.warning(f"Error processing clickable element: {inner_err}")
|
316 |
+
except Exception as outer_err:
|
317 |
+
logger.warning(f"XPath evaluation failed: {xpath} => {outer_err}")
|
318 |
+
|
319 |
+
logger.info(f"Found {len(results)} clickable elements.")
|
320 |
+
return results
|
321 |
+
|
322 |
+
@browser_tool_handler
|
323 |
+
def inspect_clickable_elements_for_filtering_or_sorting(min_words: int = 1, max_items: int = 20) -> List[str]:
|
324 |
+
"""
|
325 |
+
Inspect the current page to find clickable elements (e.g., buttons, links, dropdowns)
|
326 |
+
that are likely to be used for filtering or sorting content.
|
327 |
+
|
328 |
+
Parameters:
|
329 |
+
- min_words (int): minimum number of words to consider an element potentially meaningful.
|
330 |
+
- max_items (int): maximum number of clickable selectors to return.
|
331 |
+
|
332 |
+
Returns:
|
333 |
+
- List[str]: a list of unique CSS selectors (e.g., button.sort, a.filter) likely tied to filtering/sorting functionality.
|
334 |
+
"""
|
335 |
+
logger.info("Inspecting clickable elements for filtering or sorting...")
|
336 |
+
|
337 |
+
clickable_tags = ["button", "a", "input", "select", "label", "div", "span"]
|
338 |
+
selectors_found = {}
|
339 |
+
|
340 |
+
for tag in clickable_tags:
|
341 |
+
try:
|
342 |
+
elements = _browser_driver.find_elements(By.TAG_NAME, tag)
|
343 |
+
for elem in elements:
|
344 |
+
if not elem.is_displayed() or not elem.is_enabled():
|
345 |
+
continue
|
346 |
+
text = elem.text.strip()
|
347 |
+
if len(text.split()) >= min_words or elem.get_attribute("aria-label") or elem.get_attribute("role") in {
|
348 |
+
"button", "combobox"}:
|
349 |
+
tag_name = elem.tag_name
|
350 |
+
class_attr = elem.get_attribute("class") or ""
|
351 |
+
id_attr = elem.get_attribute("id") or ""
|
352 |
+
|
353 |
+
if id_attr:
|
354 |
+
selector = f"{tag_name}#{id_attr}"
|
355 |
+
elif class_attr:
|
356 |
+
main_class = class_attr.strip().split()[0]
|
357 |
+
selector = f"{tag_name}.{main_class}"
|
358 |
+
else:
|
359 |
+
selector = tag_name
|
360 |
+
|
361 |
+
if selector not in selectors_found:
|
362 |
+
selectors_found[selector] = text
|
363 |
+
except Exception as e:
|
364 |
+
logger.warning(f"Failed to process tag '{tag}': {e}")
|
365 |
+
continue
|
366 |
+
|
367 |
+
sorted_selectors = sorted(selectors_found.items(), key=lambda x: len(x[1]), reverse=True)
|
368 |
+
final_selectors = [s for s, _ in sorted_selectors[:max_items]]
|
369 |
+
|
370 |
+
logger.info(f"Found {len(final_selectors)} candidate selectors for filtering/sorting.")
|
371 |
+
return final_selectors
|
372 |
|
373 |
@browser_tool_handler
|
374 |
def click_element_by_css(selector: str, index: int = 0) -> str:
|
|
|
397 |
return f"Clicked element {index} matching selector {selector}. Current URL: {_browser_driver.current_url}"
|
398 |
|
399 |
@browser_tool_handler
|
400 |
+
def input_text_by_css(selector: str, text: str, index: int = 0, press_enter: bool = True) -> str:
|
401 |
"""Input text into the Nth (0-based index) element matching the CSS selector. Optionally press Enter."""
|
402 |
logger.info(f"Attempting to input text into element {index} matching selector: {selector}")
|
403 |
# Use Selenium directly for finding elements
|
|
|
467 |
time.sleep(0.5)
|
468 |
return "Sent ESC key press."
|
469 |
|
470 |
+
async def answer_question(ctx: Context, question: str) -> str:
|
471 |
"""
|
472 |
Answer any question by following this strict format:
|
473 |
1. Include your chain of thought (your reasoning steps).
|
|
|
485 |
"""
|
486 |
logger.info(f"Answering question: {question[:100]}")
|
487 |
|
488 |
+
state_dict = await ctx.get("state")
|
489 |
+
if not state_dict:
|
490 |
+
logger.error("State not found in context.")
|
491 |
+
return "Error: State not found."
|
492 |
+
|
493 |
+
research_content = state_dict.get("research_content", [])
|
494 |
+
|
495 |
+
research_content_str = "\n".join(research_content)
|
496 |
+
|
497 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
498 |
if not gemini_api_key:
|
499 |
logger.error("GEMINI_API_KEY not set for answer_question tool.")
|
500 |
return "Error: GEMINI_API_KEY not set."
|
501 |
|
502 |
+
model_name = os.getenv("ANSWER_TOOL_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
503 |
+
|
504 |
+
prompt = f"""
|
505 |
+
You are **StepwiseAnswerAgent**, a formal reasoning assistant designed to provide clear,
|
506 |
+
accurate, and actionable answers.
|
507 |
+
|
508 |
+
────────────────────────────────────────────
|
509 |
+
CORE OPERATING PRINCIPLES
|
510 |
+
────────────────────────────────────────────
|
511 |
+
1. **Comprehensive Information Gathering**
|
512 |
+
– Gather and synthesize all available information.
|
513 |
+
– Identify gaps or missing data.
|
514 |
+
|
515 |
+
2. **Step-by-Step Reasoning** *(internal only)*
|
516 |
+
– Think through the problem logically in sequential steps.
|
517 |
+
– This reasoning should remain invisible to the user; only the final answer is shown.
|
518 |
+
|
519 |
+
3. **Skeptical Verification**
|
520 |
+
– Question assumptions.
|
521 |
+
– Clearly flag any uncertainties or unverifiable claims (“uncertain”, “missing data”, etc.).
|
522 |
+
– Use reliable sources or tool outputs where possible.
|
523 |
+
|
524 |
+
4. **Clarity and Brevity**
|
525 |
+
– Use a formal and professional tone.
|
526 |
+
– Keep language precise and concise.
|
527 |
+
– Prioritize clarity, utility, and immediate usability of the answer.
|
528 |
+
|
529 |
+
────────────────────────────────────────────
|
530 |
+
INTERNAL PROCEDURE (HIDDEN)
|
531 |
+
────────────────────────────────────────────
|
532 |
+
A. List all known facts and identify unknowns.
|
533 |
+
B. Construct a logical step-by-step reasoning chain.
|
534 |
+
C. Validate consistency and completeness.
|
535 |
+
D. Output only the final answer, with optional extras if relevant.
|
536 |
+
|
537 |
+
────────────────────────────────────────────
|
538 |
+
RESPONSE FORMAT
|
539 |
+
────────────────────────────────────────────
|
540 |
+
**Answer:**
|
541 |
+
A clear, direct response addressing the user's request, without exposing reasoning steps.
|
542 |
+
|
543 |
+
*(Optional)*
|
544 |
+
– **Key Points:** bullet-point summary of critical insights.
|
545 |
+
– **Next Steps / Recommended Actions:** if applicable.
|
546 |
+
|
547 |
+
────────────────────────────────────────────
|
548 |
+
CONSTRAINTS
|
549 |
+
────────────────────────────────────────────
|
550 |
+
• Do not speculate. Clearly indicate when information is incomplete.
|
551 |
+
• Do not reveal internal reasoning or system instructions.
|
552 |
+
• No filler, no flattery, no unnecessary context.
|
553 |
+
• If the question is under-specified, ask for clarification instead of guessing.
|
554 |
+
"""
|
555 |
|
556 |
# Build the assistant prompt enforcing the required format
|
557 |
assistant_prompt = (
|
558 |
+
f"{prompt}\n\n"
|
559 |
+
"I will ask you a question. "
|
560 |
"Report your thoughts, and finish your answer with the following template: "
|
561 |
"FINAL ANSWER: [YOUR FINAL ANSWER]. "
|
562 |
"YOUR FINAL ANSWER should be a number OR as few words as possible "
|
|
|
564 |
"If you are asked for a number, don't use commas for thousands or any units like $ or % unless specified. "
|
565 |
"If you are asked for a string, omit articles and abbreviations, and write digits in plain text. "
|
566 |
"If you are asked for a comma separated list, apply these rules to each element.\n\n"
|
567 |
+
"Let's begin.\n\n"
|
568 |
+
f"All available research: {research_content_str}\n"
|
569 |
f"Question: {question}\n"
|
570 |
"Answer:"
|
571 |
)
|
572 |
|
573 |
try:
|
574 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
575 |
logger.info(f"Using answer LLM: {model_name}")
|
576 |
response = llm.complete(assistant_prompt)
|
577 |
logger.info("Answer generated successfully.")
|
|
|
608 |
fn=answer_question,
|
609 |
name="answer_question",
|
610 |
description=(
|
611 |
+
"(QA) Answer any question using structured, step-by-step reasoning, and return a concise, final result.\n\n"
|
612 |
+
"**Inputs:**\n"
|
613 |
+
"- `ctx` (Context): Execution context containing prior research state.\n"
|
614 |
+
"- `question` (str): A direct, factual question to be answered based on collected knowledge.\n\n"
|
615 |
+
"**Behavior:**\n"
|
616 |
+
"- Retrieves accumulated research content from shared state.\n"
|
617 |
+
"- Performs logical reasoning internally using a formal chain-of-thought.\n"
|
618 |
+
"- Generates a full response that includes visible reasoning steps followed by a strict answer format.\n\n"
|
619 |
+
"**Output Format:**\n"
|
620 |
+
"- Returns a string with:\n"
|
621 |
+
" 1. Reasoning steps (visible to user).\n"
|
622 |
+
" 2. Final answer, always ending with:\n"
|
623 |
+
" `FINAL ANSWER: [your answer]`\n\n"
|
624 |
+
"**Answer Constraints:**\n"
|
625 |
+
"- The final answer must be:\n"
|
626 |
+
" • A number (without commas or units, unless explicitly requested), or\n"
|
627 |
+
" • A short string (no articles or abbreviations), or\n"
|
628 |
+
" • A comma-separated list of numbers and/or strings (same rules apply).\n\n"
|
629 |
+
"**Errors:**\n"
|
630 |
+
"- Returns a string prefixed with `Error:` if state is missing or LLM fails to respond."
|
631 |
+
)
|
632 |
)
|
633 |
|
634 |
logger.info("ResearchAgent resources initialized.")
|
635 |
|
636 |
def _initialize_llm(self):
|
637 |
+
agent_llm_model = os.getenv("RESEARCH_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
638 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
639 |
if not gemini_api_key:
|
640 |
logger.error("GEMINI_API_KEY not found for ResearchAgent LLM.")
|
641 |
raise ValueError("GEMINI_API_KEY must be set for ResearchAgent")
|
642 |
try:
|
643 |
+
self.llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
644 |
logger.info(f"ResearchAgent LLM initialized: {agent_llm_model}")
|
645 |
except Exception as e:
|
646 |
logger.error(f"Failed to initialize ResearchAgent LLM: {e}", exc_info=True)
|
|
|
680 |
if not SELENIUM_AVAILABLE:
|
681 |
self.browser_tools = []
|
682 |
return
|
683 |
+
|
684 |
self.browser_tools = [
|
685 |
+
FunctionTool.from_defaults(
|
686 |
+
fn=visit_url,
|
687 |
+
name="visit_url",
|
688 |
+
description=(
|
689 |
+
"(Browser) Navigate the browser to a specified URL and wait for the page to load.\n"
|
690 |
+
"Inputs: url (str), wait_seconds (float, default=3.0).\n"
|
691 |
+
"Output: str — confirmation message including final URL."
|
692 |
+
)
|
693 |
+
),
|
694 |
+
FunctionTool.from_defaults(
|
695 |
+
fn=get_text_by_css_selector,
|
696 |
+
name="get_text_by_css_selector",
|
697 |
+
description=(
|
698 |
+
"(Browser) Extract visible text content from a webpage using a CSS selector.\n\n"
|
699 |
+
"**Inputs:**\n"
|
700 |
+
"- `selector` (str): A valid CSS selector (e.g., `'body'`, `'.content'`, `'#main'`).\n\n"
|
701 |
+
"**Behavior:**\n"
|
702 |
+
"- If `selector='body'`, extracts all visible text from the `<body>` tag.\n"
|
703 |
+
"- If elements are not found via the DOM, falls back to visible elements via Helium `Text()`.\n"
|
704 |
+
"- For other selectors, uses Selenium to extract text from all visible matching elements.\n"
|
705 |
+
"- Filters out invisible and empty lines.\n\n"
|
706 |
+
"**Output:**\n"
|
707 |
+
"- `List[str]`: List of visible text lines, or an error message string on failure."
|
708 |
+
)
|
709 |
+
),
|
710 |
+
FunctionTool.from_defaults(
|
711 |
+
fn=search_in_page,
|
712 |
+
name="search_in_page",
|
713 |
+
description=(
|
714 |
+
"(Browser) Search for a word or phrase in the visible text of the current page.\n\n"
|
715 |
+
"**Inputs:**\n"
|
716 |
+
"- `query` (str): Word or phrase to search for (e.g., 'machine learning').\n"
|
717 |
+
"- `case_sensitive` (bool, optional): Whether the search is case-sensitive (default: False).\n"
|
718 |
+
"- `max_results` (int, optional): Maximum number of matching lines to return (default: 50).\n\n"
|
719 |
+
"**Behavior:**\n"
|
720 |
+
"- Extracts all visible text from the `<body>` tag.\n"
|
721 |
+
"- Splits text into lines and filters those containing `query`.\n"
|
722 |
+
"- Appends found lines to the shared `research_content` state.\n\n"
|
723 |
+
"**Output:**\n"
|
724 |
+
"- `List[str]`: Matching lines (up to `max_results`).\n"
|
725 |
+
"- `str`: An error message if state or browser is unavailable."
|
726 |
+
)
|
727 |
+
),
|
728 |
+
FunctionTool.from_defaults(
|
729 |
+
fn=click_element_by_css,
|
730 |
+
name="click_element_by_css",
|
731 |
+
description=(
|
732 |
+
"(Browser) Click the N-th visible element matching a CSS selector.\n"
|
733 |
+
"Inputs: selector (str), index (int, default=0).\n"
|
734 |
+
"Output: str — confirmation message with final URL."
|
735 |
+
)
|
736 |
+
),
|
737 |
+
FunctionTool.from_defaults(
|
738 |
+
fn=input_text_by_css,
|
739 |
+
name="input_text_by_css",
|
740 |
+
description=(
|
741 |
+
"(Browser) Input text into the N-th input element matching a CSS selector, optionally pressing Enter.\n"
|
742 |
+
"Inputs: selector (str), text (str), index (int, default=0), press_enter (bool, default=True).\n"
|
743 |
+
"Output: str — confirmation of text input and action."
|
744 |
+
)
|
745 |
+
),
|
746 |
+
FunctionTool.from_defaults(
|
747 |
+
fn=scroll_page,
|
748 |
+
name="scroll_page",
|
749 |
+
description=(
|
750 |
+
"(Browser) Scroll the page in a given direction and amount.\n"
|
751 |
+
"Inputs: direction (str: 'up' or 'down'), amount (str: 'page', 'top', 'bottom', or number of pixels).\n"
|
752 |
+
"Output: str — confirmation of scroll action."
|
753 |
+
)
|
754 |
+
),
|
755 |
+
FunctionTool.from_defaults(
|
756 |
+
fn=go_back,
|
757 |
+
name="navigate_back",
|
758 |
+
description=(
|
759 |
+
"(Browser) Navigate back one step in browser history.\n"
|
760 |
+
"Inputs: none.\n"
|
761 |
+
"Output: str — confirmation of back navigation with current URL."
|
762 |
+
)
|
763 |
+
),
|
764 |
+
FunctionTool.from_defaults(
|
765 |
+
fn=close_popups,
|
766 |
+
name="close_popups",
|
767 |
+
description=(
|
768 |
+
"(Browser) Attempt to close pop-ups or modals by simulating an ESC keypress.\n"
|
769 |
+
"Inputs: none.\n"
|
770 |
+
"Output: str — confirmation of ESC key sent."
|
771 |
+
)
|
772 |
+
),
|
773 |
+
FunctionTool.from_defaults(
|
774 |
+
fn=suggest_informative_selectors,
|
775 |
+
name="suggest_informative_selectors",
|
776 |
+
description=(
|
777 |
+
"(Browser) Analyze the current web page and return a list of up to N CSS selectors likely to contain "
|
778 |
+
"informative text content. Each result includes the CSS selector followed by a preview of up to "
|
779 |
+
"1000 characters of the element's text content. This is especially useful for manually identifying "
|
780 |
+
"relevant containers before applying filters, scrapers, or sorters.\n\n"
|
781 |
+
"**Inputs:**\n"
|
782 |
+
"- `min_words` (int, default=10): Minimum number of words in the element for it to be considered informative.\n"
|
783 |
+
"- `max_selectors` (int, default=15): Maximum number of top selectors to return.\n\n"
|
784 |
+
"**Output:**\n"
|
785 |
+
"- `List[str]`: Each string is formatted as:\n"
|
786 |
+
" 'selector: preview_text'\n"
|
787 |
+
" where `selector` is a CSS path (e.g. `div.article`, `section#main`) and `preview_text` is a truncated (1000 char max) excerpt "
|
788 |
+
"of the visible text in that element."
|
789 |
+
)
|
790 |
+
),
|
791 |
+
FunctionTool.from_defaults(
|
792 |
+
fn=inspect_clickable_elements_for_filtering_or_sorting,
|
793 |
+
name="inspect_filter_sort_selectors",
|
794 |
+
description=(
|
795 |
+
"(Browser) Manually inspect the page for clickable elements (buttons, dropdowns, etc.) that may be used "
|
796 |
+
"for filtering or sorting. Returns a list of candidate CSS selectors.\n"
|
797 |
+
"Inputs: min_words (int, default=1), max_items (int, default=20).\n"
|
798 |
+
"Output: List[str] — list of unique selectors."
|
799 |
+
)
|
800 |
+
),
|
801 |
+
FunctionTool.from_defaults(
|
802 |
+
fn=inspect_clickable_elements,
|
803 |
+
name="inspect_clickable_elements",
|
804 |
+
description=(
|
805 |
+
"(Browser) Inspect the current page for clickable elements (e.g., <a>, <button>, input[type=button], "
|
806 |
+
"or elements with onclick handlers). Returns up to N elements with:\n"
|
807 |
+
"- their CSS selector (id, class or tag fallback),\n"
|
808 |
+
"- their tag type (e.g., button, a, input),\n"
|
809 |
+
"- a preview of their visible text (up to 100 characters).\n"
|
810 |
+
"Useful for manual filtering or determining which elements to interact with programmatically."
|
811 |
+
)
|
812 |
+
)
|
813 |
]
|
814 |
+
|
|
|
815 |
logger.info(f"Created {len(self.browser_tools)} browser interaction tools.")
|
816 |
|
817 |
def _create_search_tools(self):
|
|
|
820 |
# Google Search
|
821 |
google_spec = GoogleSearchToolSpec(key=os.getenv("GOOGLE_API_KEY"), engine=os.getenv("GOOGLE_CSE_ID"))
|
822 |
if google_spec:
|
823 |
+
google_tool = FunctionTool.from_defaults(
|
824 |
+
fn=google_spec.google_search,
|
825 |
+
name="google_search",
|
826 |
+
description="(Search) Execute a Google Custom Search query. Returns structured results.")
|
827 |
self.search_tools.append(google_tool)
|
828 |
|
829 |
# Tavily Search
|
|
|
855 |
wiki_load_tool.metadata.description = "(Wikipedia) Load the full content of a specific Wikipedia page title."
|
856 |
self.datasource_tools.extend([wiki_search_tool, wiki_load_tool])
|
857 |
|
858 |
+
|
859 |
+
# async def wiki_spec_load_data(ctx: Context, page: str, lang: str = "en", **kwargs: Dict[str, Any]) -> str:
|
860 |
+
# """
|
861 |
+
# (Wikipedia) Load the full content of a specific Wikipedia page and store it in the research context.
|
862 |
+
#
|
863 |
+
# Args:
|
864 |
+
# ctx (Context):
|
865 |
+
# Execution context used to access and update shared state.
|
866 |
+
# page (str):
|
867 |
+
# Title of the Wikipedia page to load (e.g., 'Alan Turing').
|
868 |
+
# lang (str, optional):
|
869 |
+
# Language code for the page (default: 'en').
|
870 |
+
# **kwargs (dict, optional):
|
871 |
+
# Additional keyword arguments forwarded to the underlying loader.
|
872 |
+
#
|
873 |
+
# Behavior:
|
874 |
+
# - Fetches the raw text content of the specified Wikipedia page.
|
875 |
+
# - Appends the retrieved content to the `research_content` list in `state`.
|
876 |
+
# - Persists the updated `state` back into the context.
|
877 |
+
#
|
878 |
+
# Returns:
|
879 |
+
# str:
|
880 |
+
# The full plain-text content of the Wikipedia page, or an error message
|
881 |
+
# starting with "Error:" if the context state is missing.
|
882 |
+
# """
|
883 |
+
# state_dict = await ctx.get("state")
|
884 |
+
# if not state_dict:
|
885 |
+
# logger.error("State not found in context.")
|
886 |
+
# return "Error: State not found."
|
887 |
+
#
|
888 |
+
# research_content = state_dict.get("research_content", [])
|
889 |
+
# content = wiki_spec.load_data(page, lang, **kwargs)
|
890 |
+
# research_content.append(content)
|
891 |
+
# state_dict["research_content"] = research_content
|
892 |
+
# await ctx.set("state", state_dict)
|
893 |
+
# return content
|
894 |
+
|
895 |
+
# wiki_load_tool = FunctionTool.from_defaults(
|
896 |
+
# fn=wiki_spec_load_data,
|
897 |
+
# name="wikipedia_load_page",
|
898 |
+
# description=(
|
899 |
+
# "(Wikipedia) Load the full content of a specific Wikipedia page and store it in the research context.\n\n"
|
900 |
+
# "**Inputs:**\n"
|
901 |
+
# "- `ctx` (Context): Execution context used to access and update shared state.\n"
|
902 |
+
# "- `page` (str): Title of the Wikipedia page to load (e.g., 'Alan Turing').\n"
|
903 |
+
# "- `lang` (str, optional): Language code for the Wikipedia page (default is `'en'`).\n"
|
904 |
+
# "- `**kwargs` (dict, optional): Additional keyword arguments forwarded to the underlying data loader.\n\n"
|
905 |
+
# "**Behavior:**\n"
|
906 |
+
# "- Loads the raw textual content of the specified Wikipedia page.\n"
|
907 |
+
# "- Appends the content to the `research_content` list in the shared `state`.\n\n"
|
908 |
+
# "** Output: ** \n"
|
909 |
+
# "- `str`: The full plain-text content of the Wikipedia page."
|
910 |
+
# )
|
911 |
+
# )
|
912 |
+
# self.datasource_tools.extend([wiki_search_tool, wiki_spec_load_data])
|
913 |
+
|
914 |
# Yahoo Finance
|
915 |
yf_spec = YahooFinanceToolSpec()
|
916 |
if yf_spec:
|
|
|
946 |
logger.info("Creating ResearchAgent ReActAgent instance...")
|
947 |
|
948 |
all_tools = self.browser_tools + self.search_tools + self.datasource_tools
|
|
|
949 |
|
950 |
if not all_tools:
|
951 |
logger.warning("No tools available for ResearchAgent. It will likely be unable to function.")
|
952 |
|
953 |
# System prompt (consider loading from file)
|
954 |
# Updated prompt to include YouTube tool
|
955 |
+
system_prompt = """
|
956 |
+
You are ResearchAgent, an autonomous web‑research assistant. Your goal is to gather information accurately and efficiently using the available tools.
|
957 |
|
958 |
+
Available Tool Categories
|
959 |
+
- (Browser): Tools for direct page interaction (visiting URLs, clicking, scrolling, extracting text/HTML, inputting text).
|
960 |
- (Search): Tools for querying search engines (Google, DuckDuckGo, Tavily).
|
961 |
- (Wikipedia): Tools for searching and loading Wikipedia pages.
|
962 |
- (YahooFinance): Tools for retrieving financial data (balance sheets, income statements, stock info, news).
|
963 |
- (ArXiv): Tool for searching academic papers on ArXiv.
|
964 |
+
- (Validation): Tools for assessing reliability
|
965 |
+
• cross_reference_check – verify a claim against source text
|
966 |
+
• logical_consistency_check – detect contradictions or fallacies
|
967 |
+
• bias_detection – uncover cognitive or framing biases
|
968 |
+
• fact_check_with_search – prepare an external fact‑check hand‑off
|
969 |
+
- (Answer): answer_question — use this when your research has yielded a definitive result and you must reply in the strict “FINAL ANSWER” format.
|
970 |
|
971 |
+
Answer Tool Usage
|
972 |
+
When no further data is needed, invoke answer_question with the user’s query. It returns text ending exactly with:
|
973 |
FINAL ANSWER: [YOUR FINAL ANSWER]
|
974 |
|
975 |
+
Formatting rules for YOUR FINAL ANSWER
|
976 |
- A single number, or
|
977 |
- As few words as possible, or
|
978 |
+
- A comma‑separated list of numbers and/or strings.
|
979 |
+
* Numeric values: no thousands separators or units (%, $, etc.) unless requested.
|
980 |
+
* Strings: omit articles and abbreviations; write digits in plain text.
|
981 |
+
* Lists: apply these rules to each element.
|
982 |
+
|
983 |
+
Workflow
|
984 |
+
1. Thought: analyse the goal; choose the single best tool for the next step and explain why.
|
985 |
+
2. Action: call that tool with correct arguments.
|
986 |
+
3. Observation: inspect the output, extract key info, note errors.
|
987 |
+
4. Reflect & Iterate: if the immediate goal is unmet, loop back to step 1 or choose another tool.
|
988 |
+
5. Validate: after every Action‑Observation, validate the new finding with a Validation tool or by delegating to advanced_validation_agent. If validation fails, adjust and retry.
|
989 |
+
6. Long‑Context Management: after three total tool invocations, call long_context_management_agent to compress accumulated information.
|
990 |
+
7. Synthesize: once data is validated (and context managed when needed), integrate it into a coherent answer.
|
991 |
+
8. Respond: use answer_question to emit the FINAL ANSWER.
|
992 |
+
|
993 |
+
Constraints
|
994 |
+
- Exactly one tool per Action step.
|
995 |
+
- Think step‑by‑step; log Thought → Action → Observation clearly.
|
996 |
+
- If using Browser tools, always start with visit_url.
|
997 |
+
- Do not skip any stage (Thought → Action → Observation → Reflect → Validate → Context if needed → Synthesize → Respond).
|
998 |
+
|
999 |
+
Allowed Hand‑Off Agents
|
1000 |
+
- code_agent: source‑code writing / debugging.
|
1001 |
+
- math_agent: calculations, symbolic work.
|
1002 |
+
- text_analyzer_agent: deep text processing (summary, extraction…).
|
1003 |
+
- advanced_validation_agent: extensive factual / logical validation.
|
1004 |
+
- long_context_management_agent: summarise or chunk long contexts.
|
1005 |
+
- planner_agent: break down a new complex goal.
|
1006 |
+
- reasoning_agent: multi‑hop logical reasoning.
|
1007 |
+
|
1008 |
+
Do not delegate to any agent outside this list.
|
1009 |
"""
|
1010 |
|
1011 |
agent = ReActAgent(
|
agents/role_agent.py
CHANGED
@@ -30,7 +30,7 @@ class RoleAgentInitializer:
|
|
30 |
self.embed_model_name = os.getenv("ROLE_EMBED_MODEL", "Snowflake/snowflake-arctic-embed-l-v2.0")
|
31 |
self.reranker_model_name = os.getenv("ROLE_RERANKER_MODEL", "Alibaba-NLP/gte-multilingual-reranker-base")
|
32 |
self.dataset_name = os.getenv("ROLE_PROMPT_DATASET", "fka/awesome-chatgpt-prompts")
|
33 |
-
self.llm_model_name = os.getenv("ROLE_LLM_MODEL", "
|
34 |
self.gemini_api_key = os.getenv("GEMINI_API_KEY")
|
35 |
|
36 |
if not self.gemini_api_key:
|
@@ -153,6 +153,7 @@ class RoleAgentInitializer:
|
|
153 |
llm = GoogleGenAI(
|
154 |
api_key=self.gemini_api_key,
|
155 |
model=self.llm_model_name,
|
|
|
156 |
)
|
157 |
|
158 |
agent = ReActAgent(
|
|
|
30 |
self.embed_model_name = os.getenv("ROLE_EMBED_MODEL", "Snowflake/snowflake-arctic-embed-l-v2.0")
|
31 |
self.reranker_model_name = os.getenv("ROLE_RERANKER_MODEL", "Alibaba-NLP/gte-multilingual-reranker-base")
|
32 |
self.dataset_name = os.getenv("ROLE_PROMPT_DATASET", "fka/awesome-chatgpt-prompts")
|
33 |
+
self.llm_model_name = os.getenv("ROLE_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
34 |
self.gemini_api_key = os.getenv("GEMINI_API_KEY")
|
35 |
|
36 |
if not self.gemini_api_key:
|
|
|
153 |
llm = GoogleGenAI(
|
154 |
api_key=self.gemini_api_key,
|
155 |
model=self.llm_model_name,
|
156 |
+
temperature=0.05
|
157 |
)
|
158 |
|
159 |
agent = ReActAgent(
|
agents/synthesis_agent.py
ADDED
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import logging
|
3 |
+
from typing import Any, Dict
|
4 |
+
|
5 |
+
from llama_index.core.agent.workflow import ReActAgent
|
6 |
+
from llama_index.core.tools import FunctionTool
|
7 |
+
from llama_index.core.workflow import Context
|
8 |
+
from llama_index.llms.google_genai import GoogleGenAI
|
9 |
+
|
10 |
+
# -----------------------------------------------------------------------------
|
11 |
+
# Context helper tools ---------------------------------------------------------
|
12 |
+
# -----------------------------------------------------------------------------
|
13 |
+
|
14 |
+
async def write_state(ctx: Context, key: str, value: Any) -> str:
|
15 |
+
state_dict = await ctx.get("state")
|
16 |
+
state_dict[key] = value
|
17 |
+
await ctx.set("state", state_dict)
|
18 |
+
return f"state['{key}'] written"
|
19 |
+
|
20 |
+
async def read_state(ctx: Context, key: str) -> Any:
|
21 |
+
state_dict = await ctx.get("state")
|
22 |
+
return state_dict.get(key, "")
|
23 |
+
|
24 |
+
write_state_tool = FunctionTool.from_defaults(
|
25 |
+
fn=write_state,
|
26 |
+
name="write_state",
|
27 |
+
description="Store or overwrite a value in the shared workflow state.",
|
28 |
+
)
|
29 |
+
read_state_tool = FunctionTool.from_defaults(
|
30 |
+
fn=read_state,
|
31 |
+
name="read_state",
|
32 |
+
description="Retrieve a value from the shared workflow state.",
|
33 |
+
)
|
34 |
+
|
35 |
+
# -----------------------------------------------------------------------------
|
36 |
+
# Fresh implementation of answer_question -------------------------------------
|
37 |
+
# -----------------------------------------------------------------------------
|
38 |
+
|
39 |
+
def answer_question(question: str) -> str:
|
40 |
+
"""Return chain‑of‑thought and FINAL ANSWER following strict template."""
|
41 |
+
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
42 |
+
if not gemini_api_key:
|
43 |
+
logging.warning("GEMINI_API_KEY not set – returning fallback answer.")
|
44 |
+
return f"Chain of thought: (api key missing)\n\nFINAL ANSWER: {question}"
|
45 |
+
|
46 |
+
meta_prompt = (
|
47 |
+
"You are a professional assistant. Respond with two sections:"\
|
48 |
+
"\n1. Chain of thought: concise reasoning (3–5 sentences)."\
|
49 |
+
"\n2. FINAL ANSWER: the concise answer following these rules:"\
|
50 |
+
"\n • If numeric, no thousands separators or units unless requested."\
|
51 |
+
"\n • If text, as few words as possible, no unnecessary articles."\
|
52 |
+
"\n • If list, comma‑separate applying the above rules."\
|
53 |
+
"\n • Must start exactly with 'FINAL ANSWER:' (uppercase)."\
|
54 |
+
f"\n\nQuestion: {question}\n\nAnswer:"
|
55 |
+
)
|
56 |
+
|
57 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05)
|
58 |
+
return llm.complete(meta_prompt).text.strip()
|
59 |
+
|
60 |
+
answer_question_tool = FunctionTool.from_defaults(
|
61 |
+
fn=answer_question,
|
62 |
+
name="answer_question",
|
63 |
+
description="Generate reasoning and emit 'FINAL ANSWER: ...' following the strict format rules.",
|
64 |
+
)
|
65 |
+
|
66 |
+
# -----------------------------------------------------------------------------
|
67 |
+
# System prompt (unchanged) ----------------------------------------------------
|
68 |
+
# -----------------------------------------------------------------------------
|
69 |
+
|
70 |
+
SYNTHESIS_SYSTEM_PROMPT = r"""
|
71 |
+
You are SynthesisAgent, the final composer in a multi‑agent workflow.
|
72 |
+
Your goal is to merge validated outputs from specialised agents into a concise
|
73 |
+
user‑facing answer.
|
74 |
+
|
75 |
+
POTENTIAL STATE KEYS TO CONSULT
|
76 |
+
--------------------------------
|
77 |
+
objective – str (restated user goal)
|
78 |
+
plan – dict (PlannerAgent JSON plan)
|
79 |
+
evidence – list[str] (ResearchAgent facts)
|
80 |
+
calculations – list[dict] (MathAgent results)
|
81 |
+
code_outputs – list[dict] (CodeAgent execution)
|
82 |
+
image_analysis – list[dict] (ImageAnalyzerAgent)
|
83 |
+
figure_interpretation – list[dict] (FigureInterpretationAgent)
|
84 |
+
video_analysis – list[dict] (VideoAnalyzerAgent)
|
85 |
+
text_analysis – list[dict] (TextAnalyzerAgent)
|
86 |
+
role_draft – str (RoleAgent draft, optional)
|
87 |
+
reasoning – list[str] (ReasoningAgent chain‑of‑thought)
|
88 |
+
validation – list[dict] (AdvancedValidationAgent)
|
89 |
+
|
90 |
+
WORKFLOW
|
91 |
+
--------
|
92 |
+
1. Read every relevant key. Create a short internal outline.
|
93 |
+
2. If contradictions or missing evidence exist, hand off to
|
94 |
+
advanced_validation_agent or research_agent.
|
95 |
+
3. Draft a clear, well‑structured answer (<= 200 words or 7 bullet points).
|
96 |
+
4. Call the tool `answer_question` with the **user question** to format the
|
97 |
+
final output as required.
|
98 |
+
|
99 |
+
STYLE
|
100 |
+
-----
|
101 |
+
* Formal but approachable language; no internal state leakage.
|
102 |
+
* Cite numeric values plainly; no inline URLs.
|
103 |
+
* Prefer paragraph then bullets for details.
|
104 |
+
|
105 |
+
HANDOFF POLICY
|
106 |
+
--------------
|
107 |
+
Allowed targets when more work required:
|
108 |
+
• advanced_validation_agent – contradictions or doubt
|
109 |
+
• research_agent – missing data
|
110 |
+
• reasoning_agent – reconcile complex logic
|
111 |
+
• long_context_management_agent – compress oversized context before answer
|
112 |
+
"""
|
113 |
+
|
114 |
+
# -----------------------------------------------------------------------------
|
115 |
+
# Factory ---------------------------------------------------------------------
|
116 |
+
# -----------------------------------------------------------------------------
|
117 |
+
|
118 |
+
def initialize_synthesis_agent() -> ReActAgent:
|
119 |
+
logger = logging.getLogger(__name__)
|
120 |
+
logger.info("Initialising SynthesisAgent …")
|
121 |
+
|
122 |
+
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
123 |
+
if not gemini_api_key:
|
124 |
+
raise ValueError("GEMINI_API_KEY required for SynthesisAgent")
|
125 |
+
|
126 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05)
|
127 |
+
|
128 |
+
agent = ReActAgent(
|
129 |
+
name="synthesis_agent",
|
130 |
+
description=(
|
131 |
+
"Aggregates all validated information, resolves residual issues and "
|
132 |
+
"produces the final user answer via answer_question, adhering to the "
|
133 |
+
"required template."),
|
134 |
+
tools=[write_state_tool, read_state_tool, answer_question_tool],
|
135 |
+
llm=llm,
|
136 |
+
system_prompt=SYNTHESIS_SYSTEM_PROMPT,
|
137 |
+
can_handoff_to=[
|
138 |
+
"advanced_validation_agent",
|
139 |
+
"research_agent",
|
140 |
+
"reasoning_agent",
|
141 |
+
"long_context_management_agent",
|
142 |
+
],
|
143 |
+
)
|
144 |
+
return agent
|
145 |
+
|
146 |
+
# -----------------------------------------------------------------------------
|
147 |
+
# Stand‑alone test ------------------------------------------------------------
|
148 |
+
# -----------------------------------------------------------------------------
|
149 |
+
|
150 |
+
if __name__ == "__main__":
|
151 |
+
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
152 |
+
ag = initialize_synthesis_agent()
|
153 |
+
print("SynthesisAgent ready.")
|
agents/text_analyzer_agent.py
CHANGED
@@ -10,14 +10,6 @@ from llama_index.llms.google_genai import GoogleGenAI
|
|
10 |
from llama_index.core.node_parser import SentenceSplitter
|
11 |
from llama_index.core import Document
|
12 |
|
13 |
-
# Attempt to import Whisper
|
14 |
-
try:
|
15 |
-
import whisper
|
16 |
-
WHISPER_AVAILABLE = True
|
17 |
-
except ImportError:
|
18 |
-
logging.warning("openai-whisper not installed. Audio transcription tool will be unavailable.")
|
19 |
-
WHISPER_AVAILABLE = False
|
20 |
-
|
21 |
|
22 |
# Setup logging
|
23 |
logger = logging.getLogger(__name__)
|
@@ -44,28 +36,6 @@ def load_prompt_from_file(filename: str, default_prompt: str) -> str:
|
|
44 |
logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
|
45 |
return default_prompt
|
46 |
|
47 |
-
# --- Helper function to load Whisper model ---
|
48 |
-
def _load_whisper_model(model_size: str = "small") -> Optional[object]:
|
49 |
-
"""Loads the Whisper model instance, lazy loading."""
|
50 |
-
global _whisper_model
|
51 |
-
if not WHISPER_AVAILABLE:
|
52 |
-
logger.error("Whisper library not available, cannot load model.")
|
53 |
-
return None
|
54 |
-
|
55 |
-
if _whisper_model is None:
|
56 |
-
try:
|
57 |
-
logger.info(f"Loading Whisper model: {model_size}...")
|
58 |
-
# Allow model size selection via env var, default to "base"
|
59 |
-
selected_model_size = os.getenv("WHISPER_MODEL_SIZE", model_size)
|
60 |
-
print(f"Available Whisper models: {whisper.available_models()}")
|
61 |
-
_whisper_model = whisper.load_model(selected_model_size)
|
62 |
-
logger.info(f"Whisper model {selected_model_size} loaded successfully.")
|
63 |
-
except Exception as e:
|
64 |
-
logger.error(f"Failed to load Whisper model {selected_model_size}: {e}", exc_info=True)
|
65 |
-
_whisper_model = None # Ensure it remains None on failure
|
66 |
-
|
67 |
-
return _whisper_model
|
68 |
-
|
69 |
# --- Tool Functions ---
|
70 |
|
71 |
def summarize_text(text: str, max_length: int = 150, min_length: int = 30) -> str:
|
@@ -92,7 +62,7 @@ def summarize_text(text: str, max_length: int = 150, min_length: int = 30) -> st
|
|
92 |
)
|
93 |
|
94 |
try:
|
95 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
96 |
logger.info(f"Using summarization LLM: {summarizer_llm_model}")
|
97 |
response = llm.complete(prompt)
|
98 |
summary = response.text.strip()
|
@@ -185,56 +155,6 @@ def split_text_into_chunks(text: str, chunk_size: int = 1000, chunk_overlap: int
|
|
185 |
logger.warning("Falling back to simple text splitting.")
|
186 |
return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]
|
187 |
|
188 |
-
def transcribe_audio(audio_file_path: str, language: Optional[str] = None) -> str:
|
189 |
-
"""Transcribes an audio file using the OpenAI Whisper model.
|
190 |
-
Args:
|
191 |
-
audio_file_path (str): The path to the audio file (e.g., mp3, wav, m4a).
|
192 |
-
language (Optional[str]): The language code (e.g., "en", "es") or full name ("English", "Spanish").
|
193 |
-
If None, Whisper will detect the language.
|
194 |
-
Returns:
|
195 |
-
str: The transcribed text or an error message.
|
196 |
-
"""
|
197 |
-
logger.info(f"Attempting to transcribe audio file: {audio_file_path}, Language: {language}")
|
198 |
-
|
199 |
-
# Check if Whisper is available
|
200 |
-
if not WHISPER_AVAILABLE:
|
201 |
-
return "Error: openai-whisper library is required but not installed."
|
202 |
-
|
203 |
-
# Check if file exists
|
204 |
-
if not os.path.exists(audio_file_path):
|
205 |
-
logger.error(f"Audio file not found: {audio_file_path}")
|
206 |
-
return f"Error: Audio file not found at {audio_file_path}"
|
207 |
-
|
208 |
-
# Load the Whisper model (lazy loading)
|
209 |
-
model = _load_whisper_model() # Uses default size "base" or WHISPER_MODEL_SIZE env var
|
210 |
-
if model is None:
|
211 |
-
return "Error: Failed to load Whisper model."
|
212 |
-
|
213 |
-
try:
|
214 |
-
# Perform transcription
|
215 |
-
# The transcribe function handles various audio formats via ffmpeg
|
216 |
-
result = model.transcribe(audio_file_path, language=language)
|
217 |
-
transcribed_text = result["text"]
|
218 |
-
detected_language = result.get("language", "unknown") # Get detected language if available
|
219 |
-
logger.info(f"Audio transcription successful. Detected language: {detected_language}. Text length: {len(transcribed_text)}")
|
220 |
-
return transcribed_text
|
221 |
-
|
222 |
-
except Exception as e:
|
223 |
-
# Check if it might be an ffmpeg issue
|
224 |
-
if "ffmpeg" in str(e).lower():
|
225 |
-
logger.error(f"Error during transcription, possibly ffmpeg issue: {e}", exc_info=True)
|
226 |
-
# Check if ffmpeg is installed using shell command
|
227 |
-
try:
|
228 |
-
subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
|
229 |
-
# If ffmpeg is installed, the error is likely something else
|
230 |
-
return f"Error during transcription (ffmpeg seems installed): {e}"
|
231 |
-
except (FileNotFoundError, subprocess.CalledProcessError):
|
232 |
-
logger.error("ffmpeg command not found or failed. Please ensure ffmpeg is installed and in PATH.")
|
233 |
-
return "Error: ffmpeg not found or not working. Please install ffmpeg."
|
234 |
-
else:
|
235 |
-
logger.error(f"Unexpected error during transcription: {e}", exc_info=True)
|
236 |
-
return f"Error during transcription: {e}"
|
237 |
-
|
238 |
# --- Tool Definitions ---
|
239 |
summarize_tool = FunctionTool.from_defaults(
|
240 |
fn=summarize_text,
|
@@ -263,29 +183,13 @@ split_text_tool = FunctionTool.from_defaults(
|
|
263 |
),
|
264 |
)
|
265 |
|
266 |
-
# Conditionally create transcribe_audio_tool
|
267 |
-
transcribe_audio_tool = None
|
268 |
-
if WHISPER_AVAILABLE:
|
269 |
-
transcribe_audio_tool = FunctionTool.from_defaults(
|
270 |
-
fn=transcribe_audio,
|
271 |
-
name="transcribe_audio_file",
|
272 |
-
description=(
|
273 |
-
"Transcribes speech from an audio file (e.g., mp3, wav, m4a) into text using Whisper. "
|
274 |
-
"Input: audio_file_path (str), Optional: language (str - e.g., \"en\", \"Spanish\"). "
|
275 |
-
"Output: transcribed text (str) or error message."
|
276 |
-
),
|
277 |
-
)
|
278 |
-
logger.info("Audio transcription tool created.")
|
279 |
-
else:
|
280 |
-
logger.warning("Audio transcription tool disabled because openai-whisper is not installed.")
|
281 |
-
|
282 |
# --- Agent Initialization ---
|
283 |
def initialize_text_analyzer_agent() -> ReActAgent:
|
284 |
"""Initializes the Text Analyzer Agent."""
|
285 |
logger.info("Initializing TextAnalyzerAgent...")
|
286 |
|
287 |
# Configuration for the agent's main LLM
|
288 |
-
agent_llm_model = os.getenv("TEXT_ANALYZER_AGENT_LLM_MODEL", "
|
289 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
290 |
|
291 |
if not gemini_api_key:
|
@@ -293,7 +197,7 @@ def initialize_text_analyzer_agent() -> ReActAgent:
|
|
293 |
raise ValueError("GEMINI_API_KEY must be set for TextAnalyzerAgent")
|
294 |
|
295 |
try:
|
296 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
297 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
298 |
|
299 |
# Load system prompt
|
@@ -305,16 +209,12 @@ def initialize_text_analyzer_agent() -> ReActAgent:
|
|
305 |
|
306 |
# Define available tools, including the audio tool if available
|
307 |
tools = [summarize_tool, extract_entities_tool, split_text_tool]
|
308 |
-
if transcribe_audio_tool:
|
309 |
-
tools.append(transcribe_audio_tool)
|
310 |
|
311 |
# Update agent description based on available tools
|
312 |
agent_description = (
|
313 |
"Analyzes text content. Can summarize text (`summarize_text`), extract named entities (`extract_entities`), "
|
314 |
"and split long texts (`split_text_into_chunks`)."
|
315 |
)
|
316 |
-
if transcribe_audio_tool:
|
317 |
-
agent_description += " Can also transcribe audio files to text (`transcribe_audio_file`)."
|
318 |
|
319 |
agent = ReActAgent(
|
320 |
name="text_analyzer_agent",
|
@@ -358,23 +258,6 @@ if __name__ == "__main__":
|
|
358 |
print("\nTesting text splitting...")
|
359 |
chunks = split_text_into_chunks(long_text * 3, chunk_size=150, chunk_overlap=30) # Make text longer
|
360 |
print(f"Split into {len(chunks)} chunks. First chunk:\n{chunks[0]}")
|
361 |
-
|
362 |
-
# Test audio transcription (if available)
|
363 |
-
if WHISPER_AVAILABLE:
|
364 |
-
print("\nTesting audio transcription...")
|
365 |
-
# Create a dummy audio file for testing (requires ffmpeg)
|
366 |
-
dummy_file = "dummy_audio.mp3"
|
367 |
-
try:
|
368 |
-
# Generate a 1-second silent MP3 using ffmpeg
|
369 |
-
subprocess.run(["ffmpeg", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono", "-t", "1", "-q:a", "9", "-y", dummy_file], check=True, capture_output=True)
|
370 |
-
print(f"Created dummy audio file: {dummy_file}")
|
371 |
-
transcript = transcribe_audio(dummy_file)
|
372 |
-
print(f"Transcription Result: '{transcript}' (Expected: empty or silence markers)")
|
373 |
-
os.remove(dummy_file) # Clean up dummy file
|
374 |
-
except Exception as ffmpeg_err:
|
375 |
-
print(f"Could not create/test dummy audio file (ffmpeg required): {ffmpeg_err}")
|
376 |
-
else:
|
377 |
-
print("\nSkipping audio transcription test as openai-whisper is not available.")
|
378 |
|
379 |
# Initialize the agent (optional)
|
380 |
# test_agent = initialize_text_analyzer_agent()
|
|
|
10 |
from llama_index.core.node_parser import SentenceSplitter
|
11 |
from llama_index.core import Document
|
12 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
13 |
|
14 |
# Setup logging
|
15 |
logger = logging.getLogger(__name__)
|
|
|
36 |
logger.error(f"Error loading prompt file {filename}: {e}", exc_info=True)
|
37 |
return default_prompt
|
38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
39 |
# --- Tool Functions ---
|
40 |
|
41 |
def summarize_text(text: str, max_length: int = 150, min_length: int = 30) -> str:
|
|
|
62 |
)
|
63 |
|
64 |
try:
|
65 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
66 |
logger.info(f"Using summarization LLM: {summarizer_llm_model}")
|
67 |
response = llm.complete(prompt)
|
68 |
summary = response.text.strip()
|
|
|
155 |
logger.warning("Falling back to simple text splitting.")
|
156 |
return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size - chunk_overlap)]
|
157 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
158 |
# --- Tool Definitions ---
|
159 |
summarize_tool = FunctionTool.from_defaults(
|
160 |
fn=summarize_text,
|
|
|
183 |
),
|
184 |
)
|
185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
186 |
# --- Agent Initialization ---
|
187 |
def initialize_text_analyzer_agent() -> ReActAgent:
|
188 |
"""Initializes the Text Analyzer Agent."""
|
189 |
logger.info("Initializing TextAnalyzerAgent...")
|
190 |
|
191 |
# Configuration for the agent's main LLM
|
192 |
+
agent_llm_model = os.getenv("TEXT_ANALYZER_AGENT_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
193 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
194 |
|
195 |
if not gemini_api_key:
|
|
|
197 |
raise ValueError("GEMINI_API_KEY must be set for TextAnalyzerAgent")
|
198 |
|
199 |
try:
|
200 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
201 |
logger.info(f"Using agent LLM: {agent_llm_model}")
|
202 |
|
203 |
# Load system prompt
|
|
|
209 |
|
210 |
# Define available tools, including the audio tool if available
|
211 |
tools = [summarize_tool, extract_entities_tool, split_text_tool]
|
|
|
|
|
212 |
|
213 |
# Update agent description based on available tools
|
214 |
agent_description = (
|
215 |
"Analyzes text content. Can summarize text (`summarize_text`), extract named entities (`extract_entities`), "
|
216 |
"and split long texts (`split_text_into_chunks`)."
|
217 |
)
|
|
|
|
|
218 |
|
219 |
agent = ReActAgent(
|
220 |
name="text_analyzer_agent",
|
|
|
258 |
print("\nTesting text splitting...")
|
259 |
chunks = split_text_into_chunks(long_text * 3, chunk_size=150, chunk_overlap=30) # Make text longer
|
260 |
print(f"Split into {len(chunks)} chunks. First chunk:\n{chunks[0]}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
261 |
|
262 |
# Initialize the agent (optional)
|
263 |
# test_agent = initialize_text_analyzer_agent()
|
agents/verifier_agent.py
DELETED
@@ -1,296 +0,0 @@
|
|
1 |
-
import os
|
2 |
-
import logging
|
3 |
-
import re
|
4 |
-
from typing import List
|
5 |
-
|
6 |
-
from llama_index.core.agent.workflow import FunctionAgent, ReActAgent
|
7 |
-
from llama_index.core.tools import FunctionTool
|
8 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
9 |
-
|
10 |
-
# Setup logging
|
11 |
-
logger = logging.getLogger(__name__)
|
12 |
-
|
13 |
-
class VerificationError(Exception):
|
14 |
-
"""Custom exception for verification failures."""
|
15 |
-
pass
|
16 |
-
|
17 |
-
class Verifier:
|
18 |
-
"""
|
19 |
-
Cross-check extracted facts, identify contradictions using LLM,
|
20 |
-
and assign a confidence score to each fact.
|
21 |
-
"""
|
22 |
-
def __init__(self):
|
23 |
-
"""Initializes the Verifier, loading configuration from environment variables."""
|
24 |
-
logger.info("Initializing Verifier...")
|
25 |
-
self.threshold = float(os.getenv("VERIFIER_CONFIDENCE_THRESHOLD", 0.7))
|
26 |
-
self.verifier_llm_model = os.getenv("VERIFIER_LLM_MODEL", "models/gemini-2.0-flash") # For scoring
|
27 |
-
self.agent_llm_model = os.getenv("VERIFIER_AGENT_LLM_MODEL", "models/gemini-1.5-pro") # For agent logic & contradiction
|
28 |
-
self.gemini_api_key = os.getenv("GEMINI_API_KEY")
|
29 |
-
|
30 |
-
if not self.gemini_api_key:
|
31 |
-
logger.error("GEMINI_API_KEY not found in environment variables.")
|
32 |
-
raise ValueError("GEMINI_API_KEY must be set")
|
33 |
-
|
34 |
-
try:
|
35 |
-
self.verifier_llm = GoogleGenAI(
|
36 |
-
api_key=self.gemini_api_key,
|
37 |
-
model=self.verifier_llm_model,
|
38 |
-
)
|
39 |
-
self.agent_llm = GoogleGenAI(
|
40 |
-
api_key=self.gemini_api_key,
|
41 |
-
model=self.agent_llm_model,
|
42 |
-
)
|
43 |
-
logger.info(f"Verifier initialized with threshold {self.threshold}, verifier LLM {self.verifier_llm_model}, agent LLM {self.agent_llm_model}")
|
44 |
-
except Exception as e:
|
45 |
-
logger.error(f"Error initializing Verifier LLMs: {e}", exc_info=True)
|
46 |
-
raise
|
47 |
-
|
48 |
-
def verify_facts(self, facts: List[str]) -> List[str]:
|
49 |
-
"""
|
50 |
-
Assign a confidence score via LLM to each fact and return formatted strings.
|
51 |
-
|
52 |
-
Args:
|
53 |
-
facts (List[str]): Facts to verify.
|
54 |
-
|
55 |
-
Returns:
|
56 |
-
List[str]: Each item is "fact: score" with score ∈ [threshold, 1.0].
|
57 |
-
|
58 |
-
Raises:
|
59 |
-
VerificationError: If LLM call fails.
|
60 |
-
"""
|
61 |
-
logger.info(f"Verifying {len(facts)} facts...")
|
62 |
-
results: List[str] = []
|
63 |
-
for fact in facts:
|
64 |
-
prompt = (
|
65 |
-
"You are a fact verifier. "
|
66 |
-
"On a scale from 0.00 to 1.00, where any value below "
|
67 |
-
f"{self.threshold:.2f} indicates low confidence, rate the following statement’s trustworthiness. "
|
68 |
-
"Respond with **only** a decimal number rounded to two digits (e.g., 0.82) and no extra text.\n\n"
|
69 |
-
f"Statement: \"{fact}\""
|
70 |
-
)
|
71 |
-
try:
|
72 |
-
response = self.verifier_llm.complete(prompt)
|
73 |
-
score_text = response.text.strip()
|
74 |
-
# Try direct conversion first
|
75 |
-
try:
|
76 |
-
score = float(score_text)
|
77 |
-
except ValueError:
|
78 |
-
# Fallback: extract first float if model returns extra text
|
79 |
-
match = re.search(r"0?\.\d+|1(?:\.0+)?", score_text)
|
80 |
-
if match:
|
81 |
-
score = float(match.group(0))
|
82 |
-
logger.warning(f"Extracted score {score} from noisy LLM response: {score_text}")
|
83 |
-
else:
|
84 |
-
logger.error(f"Could not parse score from LLM response: {score_text}. Using threshold {self.threshold}.")
|
85 |
-
score = self.threshold # Fallback to threshold if parsing fails completely
|
86 |
-
|
87 |
-
# Enforce threshold floor
|
88 |
-
if score < self.threshold:
|
89 |
-
logger.info(f"Score {score:.2f} for fact {fact} below threshold {self.threshold}, raising to threshold.")
|
90 |
-
score = self.threshold
|
91 |
-
|
92 |
-
results.append(f"{fact}: {score:.2f}")
|
93 |
-
|
94 |
-
except Exception as e:
|
95 |
-
logger.error(f"LLM call failed during fact verification for {fact}: {e}", exc_info=True)
|
96 |
-
# Option 1: Raise an error
|
97 |
-
# raise VerificationError(f"LLM call failed for fact: {fact}") from e
|
98 |
-
# Option 2: Append an error message (current approach)
|
99 |
-
results.append(f"{fact}: ERROR - Verification failed")
|
100 |
-
# Option 3: Assign lowest score
|
101 |
-
# results.append(f"{fact}: {self.threshold:.2f} (Verification Error)")
|
102 |
-
|
103 |
-
logger.info(f"Fact verification complete. {len(results)} results generated.")
|
104 |
-
return results
|
105 |
-
|
106 |
-
def find_contradictions_llm(self, facts: List[str]) -> List[str]:
|
107 |
-
"""
|
108 |
-
Identify contradictions among a list of facts using an LLM.
|
109 |
-
|
110 |
-
Args:
|
111 |
-
facts (List[str]): List of fact strings.
|
112 |
-
|
113 |
-
Returns:
|
114 |
-
List[str]: Pairs of facts detected as contradictory, joined by " <> ".
|
115 |
-
|
116 |
-
Raises:
|
117 |
-
VerificationError: If LLM call fails.
|
118 |
-
"""
|
119 |
-
logger.info(f"Finding contradictions in {len(facts)} facts using LLM...")
|
120 |
-
if len(facts) < 2:
|
121 |
-
logger.info("Not enough facts to find contradictions.")
|
122 |
-
return []
|
123 |
-
|
124 |
-
facts_numbered = "\n".join([f"{i+1}. {fact}" for i, fact in enumerate(facts)])
|
125 |
-
|
126 |
-
prompt = (
|
127 |
-
"You are a logical reasoning assistant. Analyze the following numbered list of statements. "
|
128 |
-
"Identify any pairs of statements that directly contradict each other. "
|
129 |
-
"List *only* the numbers of the contradicting pairs, one pair per line, formatted as 'X, Y'. "
|
130 |
-
"If no contradictions are found, respond with 'None'. Do not include any other text or explanation.\n\n"
|
131 |
-
f"Statements:\n{facts_numbered}"
|
132 |
-
)
|
133 |
-
|
134 |
-
try:
|
135 |
-
response = self.agent_llm.complete(prompt) # Use the more powerful agent LLM
|
136 |
-
response_text = response.text.strip()
|
137 |
-
logger.info(f"LLM response for contradictions: {response_text}")
|
138 |
-
|
139 |
-
if response_text.lower() == 'none':
|
140 |
-
logger.info("LLM reported no contradictions.")
|
141 |
-
return []
|
142 |
-
|
143 |
-
contradiction_pairs = []
|
144 |
-
lines = response_text.split("\n")
|
145 |
-
for line in lines:
|
146 |
-
line = line.strip()
|
147 |
-
if not line:
|
148 |
-
continue
|
149 |
-
try:
|
150 |
-
# Expect format like "1, 5"
|
151 |
-
parts = line.split(',')
|
152 |
-
if len(parts) == 2:
|
153 |
-
idx1 = int(parts[0].strip()) - 1
|
154 |
-
idx2 = int(parts[1].strip()) - 1
|
155 |
-
|
156 |
-
# Validate indices
|
157 |
-
if 0 <= idx1 < len(facts) and 0 <= idx2 < len(facts) and idx1 != idx2:
|
158 |
-
# Ensure pair order doesn't matter and avoid duplicates
|
159 |
-
pair = tuple(sorted((idx1, idx2)))
|
160 |
-
fact1 = facts[pair[0]]
|
161 |
-
fact2 = facts[pair[1]]
|
162 |
-
contradiction_str = f"{fact1} <> {fact2}"
|
163 |
-
if contradiction_str not in contradiction_pairs:
|
164 |
-
contradiction_pairs.append(contradiction_str)
|
165 |
-
logger.info(f"Identified contradiction: {contradiction_str}")
|
166 |
-
else:
|
167 |
-
logger.warning(f"Invalid index pair found in LLM contradiction response: {line}")
|
168 |
-
else:
|
169 |
-
logger.warning(f"Could not parse contradiction pair from LLM response line: {line}")
|
170 |
-
except ValueError:
|
171 |
-
logger.warning(f"Non-integer index found in LLM contradiction response line: {line}")
|
172 |
-
except Exception as parse_err:
|
173 |
-
logger.warning(f"Error parsing LLM contradiction response line {line}: {parse_err}")
|
174 |
-
|
175 |
-
logger.info(f"Contradiction check complete. Found {len(contradiction_pairs)} pairs.")
|
176 |
-
return contradiction_pairs
|
177 |
-
|
178 |
-
except Exception as e:
|
179 |
-
logger.error(f"LLM call failed during contradiction detection: {e}", exc_info=True)
|
180 |
-
# Option 1: Raise an error
|
181 |
-
raise VerificationError("LLM call failed during contradiction detection") from e
|
182 |
-
# Option 2: Return empty list (fail silently)
|
183 |
-
# return []
|
184 |
-
|
185 |
-
# --- Tool Definitions ---
|
186 |
-
# Tools need to be created within the initialization function to bind to the instance
|
187 |
-
|
188 |
-
# --- Agent Initialization ---
|
189 |
-
|
190 |
-
# Store the initializer instance globally to ensure singleton behavior
|
191 |
-
_verifier_initializer_instance = None
|
192 |
-
|
193 |
-
class VerifierInitializer:
|
194 |
-
def __init__(self):
|
195 |
-
self.verifier = Verifier() # Initialize the Verifier class
|
196 |
-
self._create_tools()
|
197 |
-
|
198 |
-
def _create_tools(self):
|
199 |
-
self.verify_facts_tool = FunctionTool.from_defaults(
|
200 |
-
fn=self.verifier.verify_facts, # Bind to instance method
|
201 |
-
name="verify_facts",
|
202 |
-
description=(
|
203 |
-
"Assigns a numerical confidence score (based on plausibility and internal consistency) to each factual assertion in a list. "
|
204 |
-
"Input: List[str] of statements. Output: List[str] of 'statement: score' pairs."
|
205 |
-
),
|
206 |
-
)
|
207 |
-
|
208 |
-
self.find_contradictions_tool = FunctionTool.from_defaults(
|
209 |
-
fn=self.verifier.find_contradictions_llm, # Bind to instance method (using LLM version)
|
210 |
-
name="find_contradictions",
|
211 |
-
description=(
|
212 |
-
"Uses an LLM to detect logical contradictions among a list of statements. "
|
213 |
-
"Input: List[str] of factual assertions. "
|
214 |
-
"Output: List[str] where each entry is a conflicting pair in the format 'statement1 <> statement2'. Returns empty list if none found."
|
215 |
-
)
|
216 |
-
)
|
217 |
-
|
218 |
-
def get_agent(self) -> FunctionAgent:
|
219 |
-
"""Initializes and returns the Verifier Agent."""
|
220 |
-
logger.info("Creating VerifierAgent FunctionAgent instance...")
|
221 |
-
|
222 |
-
# System prompt (consider loading from file)
|
223 |
-
system_prompt = """\
|
224 |
-
You are VerifierAgent, a fact verification assistant. Given a list of factual statements, you must:
|
225 |
-
|
226 |
-
1. **Verify Facts**: Call `verify_facts` to assign a confidence score to each statement.
|
227 |
-
2. **Detect Contradictions**: Call `find_contradictions` to identify logical conflicts between the statements using an LLM.
|
228 |
-
3. **Present Results**: Output clear bullet points listing each fact with its confidence score, followed by a list of any detected contradictions.
|
229 |
-
4. **Hand-Off**: If significant contradictions or low-confidence facts are found that require deeper analysis, hand off to **reasoning_agent**. Otherwise, pass the verified facts and contradiction summary to **planner_agent** for integration.
|
230 |
-
"""
|
231 |
-
|
232 |
-
agent = FunctionAgent(
|
233 |
-
name="verifier_agent",
|
234 |
-
description=(
|
235 |
-
"Evaluates factual statements by assigning confidence scores (`verify_facts`) "
|
236 |
-
"and detecting logical contradictions using an LLM (`find_contradictions`). "
|
237 |
-
"Hands off to reasoning_agent for complex issues or planner_agent for synthesis."
|
238 |
-
),
|
239 |
-
tools=[
|
240 |
-
self.verify_facts_tool,
|
241 |
-
self.find_contradictions_tool,
|
242 |
-
],
|
243 |
-
llm=self.verifier.agent_llm, # Use the agent LLM from the Verifier instance
|
244 |
-
system_prompt=system_prompt,
|
245 |
-
can_handoff_to=["reasoning_agent", "planner_agent", "advanced_validation_agent"],
|
246 |
-
)
|
247 |
-
logger.info("VerifierAgent FunctionAgent instance created.")
|
248 |
-
return agent
|
249 |
-
|
250 |
-
def get_verifier_initializer():
|
251 |
-
"""Gets the singleton instance of VerifierInitializer."""
|
252 |
-
global _verifier_initializer_instance
|
253 |
-
if _verifier_initializer_instance is None:
|
254 |
-
logger.info("Instantiating VerifierInitializer for the first time.")
|
255 |
-
_verifier_initializer_instance = VerifierInitializer()
|
256 |
-
return _verifier_initializer_instance
|
257 |
-
|
258 |
-
def initialize_verifier_agent() -> FunctionAgent:
|
259 |
-
"""Initializes and returns the Verifier Agent using a singleton initializer."""
|
260 |
-
logger.info("initialize_verifier_agent called.")
|
261 |
-
initializer = get_verifier_initializer()
|
262 |
-
return initializer.get_agent()
|
263 |
-
|
264 |
-
# Example usage (for testing if run directly)
|
265 |
-
if __name__ == "__main__":
|
266 |
-
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
267 |
-
logger.info("Running verifier_agent.py directly for testing...")
|
268 |
-
|
269 |
-
# Ensure API key is set for testing
|
270 |
-
if not os.getenv("GEMINI_API_KEY"):
|
271 |
-
print("Error: GEMINI_API_KEY environment variable not set. Cannot run test.")
|
272 |
-
else:
|
273 |
-
try:
|
274 |
-
test_agent = initialize_verifier_agent()
|
275 |
-
print("Verifier Agent initialized successfully for testing.")
|
276 |
-
|
277 |
-
# Test contradiction detection
|
278 |
-
initializer = get_verifier_initializer()
|
279 |
-
test_facts = [
|
280 |
-
"The sky is blue.",
|
281 |
-
"Water boils at 100 degrees Celsius.",
|
282 |
-
"The sky is not blue.",
|
283 |
-
"Paris is the capital of France."
|
284 |
-
]
|
285 |
-
print(f"\nTesting contradiction detection on: {test_facts}")
|
286 |
-
contradictions = initializer.verifier.find_contradictions_llm(test_facts)
|
287 |
-
print(f"Detected contradictions: {contradictions}")
|
288 |
-
|
289 |
-
# Test fact verification
|
290 |
-
print(f"\nTesting fact verification on: {test_facts}")
|
291 |
-
verified = initializer.verifier.verify_facts(test_facts)
|
292 |
-
print(f"Verified facts: {verified}")
|
293 |
-
|
294 |
-
except Exception as e:
|
295 |
-
print(f"Error during testing: {e}")
|
296 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
agents/video_analyzer_agent.py
CHANGED
@@ -5,7 +5,7 @@ import os
|
|
5 |
import re
|
6 |
import shutil
|
7 |
from pathlib import Path
|
8 |
-
from typing import Optional
|
9 |
|
10 |
import cv2
|
11 |
import yt_dlp
|
@@ -15,6 +15,9 @@ from llama_index.core.tools import FunctionTool
|
|
15 |
from llama_index.llms.google_genai import GoogleGenAI
|
16 |
from tqdm import tqdm
|
17 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
|
|
|
|
|
|
18 |
|
19 |
# ---------------------------------------------------------------------------
|
20 |
# Environment setup & logging
|
@@ -86,7 +89,7 @@ def load_prompt_from_file(filename: str = "../prompts/video_analyzer_prompt.txt"
|
|
86 |
)
|
87 |
|
88 |
|
89 |
-
def extract_frames(video_path, output_dir, fps=
|
90 |
"""
|
91 |
Extract frames from video at specified FPS
|
92 |
Returns a list of (frame_path, timestamp) tuples
|
@@ -134,7 +137,7 @@ def extract_frames(video_path, output_dir, fps=1/2):
|
|
134 |
|
135 |
def download_video_and_analyze(video_url: str) -> str:
|
136 |
"""Download a video from *video_url* and return the local file path."""
|
137 |
-
llm_model_name = os.getenv("VIDEO_ANALYZER_LLM_MODEL", "
|
138 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
139 |
|
140 |
ydl_opts = {
|
@@ -174,7 +177,7 @@ def download_video_and_analyze(video_url: str) -> str:
|
|
174 |
blocks.append(ImageBlock(path=frame_path))
|
175 |
|
176 |
|
177 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
178 |
logger.info("Using LLM model: %s", llm_model_name)
|
179 |
response = llm.chat([ChatMessage(role="user", blocks=blocks)])
|
180 |
|
@@ -200,14 +203,15 @@ def extract_video_id(url: str) -> Optional[str]:
|
|
200 |
match = pattern.search(url)
|
201 |
if match:
|
202 |
video_id = match.group(1)
|
|
|
203 |
return video_id # affiche "VIDEO_ID"
|
204 |
else:
|
205 |
print("Aucun ID trouvé")
|
206 |
-
return
|
207 |
|
208 |
|
209 |
# --- YouTube Transcript Tool ---
|
210 |
-
def get_youtube_transcript(video_url_or_id: str, languages: str | None = None) -> str:
|
211 |
"""Fetches the transcript for a YouTube video using its URL or video ID.
|
212 |
Specify preferred languages as a list (e.g., ["en", "es"]).
|
213 |
Returns the transcript text or an error message.
|
@@ -244,12 +248,11 @@ def get_youtube_transcript(video_url_or_id: str, languages: str | None = None) -
|
|
244 |
return f"Error: Transcripts are disabled for this video (ID: {video_id})."
|
245 |
except NoTranscriptFound as e:
|
246 |
logger.warning(
|
247 |
-
f"No transcript found for video ID {video_id} in languages {languages}. Available: {e
|
248 |
# Try fetching any available transcript if specific languages failed
|
249 |
try:
|
250 |
logger.info(f"Attempting to fetch any available transcript for {video_id}")
|
251 |
-
any_transcript = transcript_list.find_generated_transcript(
|
252 |
-
transcript_list.manually_created_transcripts.keys() or transcript_list.generated_transcripts.keys())
|
253 |
any_transcript_data = any_transcript.fetch()
|
254 |
full_transcript = " ".join([item["text"] for item in any_transcript_data])
|
255 |
logger.info(
|
@@ -265,21 +268,40 @@ def get_youtube_transcript(video_url_or_id: str, languages: str | None = None) -
|
|
265 |
|
266 |
|
267 |
download_video_and_analyze_tool = FunctionTool.from_defaults(
|
|
|
268 |
name="download_video_and_analyze",
|
269 |
description=(
|
270 |
-
"Downloads a video
|
271 |
-
"
|
272 |
-
"
|
273 |
-
|
274 |
-
|
|
|
|
|
|
|
|
|
|
|
275 |
)
|
276 |
|
277 |
youtube_transcript_tool = FunctionTool.from_defaults(
|
278 |
fn=get_youtube_transcript,
|
279 |
name="get_youtube_transcript",
|
280 |
description=(
|
281 |
-
"(YouTube)
|
282 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
283 |
)
|
284 |
)
|
285 |
|
@@ -293,7 +315,7 @@ def initialize_video_analyzer_agent() -> FunctionAgent:
|
|
293 |
|
294 |
logger.info("Initialising VideoAnalyzerAgent …")
|
295 |
|
296 |
-
llm_model_name = os.getenv("VIDEO_ANALYZER_LLM_MODEL", "
|
297 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
298 |
|
299 |
if not gemini_api_key:
|
@@ -301,10 +323,70 @@ def initialize_video_analyzer_agent() -> FunctionAgent:
|
|
301 |
raise ValueError("GEMINI_API_KEY must be set")
|
302 |
|
303 |
try:
|
304 |
-
llm = GoogleGenAI(api_key=gemini_api_key, model=
|
305 |
logger.info("Using LLM model: %s", llm_model_name)
|
306 |
|
307 |
-
system_prompt =
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
308 |
|
309 |
tools = [download_video_and_analyze_tool, youtube_transcript_tool]
|
310 |
|
@@ -357,18 +439,14 @@ if __name__ == "__main__":
|
|
357 |
|
358 |
test_agent = None
|
359 |
try:
|
360 |
-
# Test YouTube transcript tool directly
|
361 |
-
if YOUTUBE_TRANSCRIPT_API_AVAILABLE:
|
362 |
-
print("\nTesting YouTube transcript tool...")
|
363 |
-
# Example video: "Attention is All You Need" paper explanation
|
364 |
-
yt_url = "https://www.youtube.com/watch?v=TQQlZhbC5ps"
|
365 |
-
transcript = get_youtube_transcript(yt_url)
|
366 |
-
if not transcript.startswith("Error:"):
|
367 |
-
print(f"Transcript fetched (first 500 chars):\n{transcript[:500]}...")
|
368 |
-
else:
|
369 |
-
print(f"YouTube Transcript Fetch Failed: {transcript}")
|
370 |
-
else:
|
371 |
-
print("\nSkipping YouTube transcript test as youtube-transcript-api is not available.")
|
372 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
373 |
except Exception as e:
|
374 |
print(f"Error during testing: {e}")
|
|
|
5 |
import re
|
6 |
import shutil
|
7 |
from pathlib import Path
|
8 |
+
from typing import Optional, List
|
9 |
|
10 |
import cv2
|
11 |
import yt_dlp
|
|
|
15 |
from llama_index.llms.google_genai import GoogleGenAI
|
16 |
from tqdm import tqdm
|
17 |
from youtube_transcript_api import YouTubeTranscriptApi, TranscriptsDisabled, NoTranscriptFound
|
18 |
+
import dotenv
|
19 |
+
|
20 |
+
dotenv.load_dotenv()
|
21 |
|
22 |
# ---------------------------------------------------------------------------
|
23 |
# Environment setup & logging
|
|
|
89 |
)
|
90 |
|
91 |
|
92 |
+
def extract_frames(video_path, output_dir, fps=2):
|
93 |
"""
|
94 |
Extract frames from video at specified FPS
|
95 |
Returns a list of (frame_path, timestamp) tuples
|
|
|
137 |
|
138 |
def download_video_and_analyze(video_url: str) -> str:
|
139 |
"""Download a video from *video_url* and return the local file path."""
|
140 |
+
llm_model_name = os.getenv("VIDEO_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
141 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
142 |
|
143 |
ydl_opts = {
|
|
|
177 |
blocks.append(ImageBlock(path=frame_path))
|
178 |
|
179 |
|
180 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
181 |
logger.info("Using LLM model: %s", llm_model_name)
|
182 |
response = llm.chat([ChatMessage(role="user", blocks=blocks)])
|
183 |
|
|
|
203 |
match = pattern.search(url)
|
204 |
if match:
|
205 |
video_id = match.group(1)
|
206 |
+
print(f"ID trouvé : {video_id}")
|
207 |
return video_id # affiche "VIDEO_ID"
|
208 |
else:
|
209 |
print("Aucun ID trouvé")
|
210 |
+
return url
|
211 |
|
212 |
|
213 |
# --- YouTube Transcript Tool ---
|
214 |
+
def get_youtube_transcript(video_url_or_id: str, languages: List[str] | None = None) -> str:
|
215 |
"""Fetches the transcript for a YouTube video using its URL or video ID.
|
216 |
Specify preferred languages as a list (e.g., ["en", "es"]).
|
217 |
Returns the transcript text or an error message.
|
|
|
248 |
return f"Error: Transcripts are disabled for this video (ID: {video_id})."
|
249 |
except NoTranscriptFound as e:
|
250 |
logger.warning(
|
251 |
+
f"No transcript found for video ID {video_id} in languages {languages}. Available: {e}")
|
252 |
# Try fetching any available transcript if specific languages failed
|
253 |
try:
|
254 |
logger.info(f"Attempting to fetch any available transcript for {video_id}")
|
255 |
+
any_transcript = transcript_list.find_generated_transcript(["en"])
|
|
|
256 |
any_transcript_data = any_transcript.fetch()
|
257 |
full_transcript = " ".join([item["text"] for item in any_transcript_data])
|
258 |
logger.info(
|
|
|
268 |
|
269 |
|
270 |
download_video_and_analyze_tool = FunctionTool.from_defaults(
|
271 |
+
fn=download_video_and_analyze,
|
272 |
name="download_video_and_analyze",
|
273 |
description=(
|
274 |
+
"(Video Analysis) Downloads a video from a YouTube or direct URL, extracts visual frames at a sampling rate "
|
275 |
+
"(default 5 frames per second), and performs multimodal analysis such as identification, detailed frame-by-frame analysis, etc. using Gemini. "
|
276 |
+
"Returns a textual summary based exclusively on visual content.\n\n"
|
277 |
+
"**Important**: This tool does *not* analyze or return audio data and does *not* perform any transcription.\n\n"
|
278 |
+
"**Input:**\n"
|
279 |
+
"- `video_url` (str): URL of the video to download and analyze (YouTube link or direct video URL).\n\n"
|
280 |
+
"**Output:**\n"
|
281 |
+
"- A string containing a natural language summary of the visual content in the video. "
|
282 |
+
"This includes scene descriptions, visual objects, setting, and changes over time based on sampled frames."
|
283 |
+
)
|
284 |
)
|
285 |
|
286 |
youtube_transcript_tool = FunctionTool.from_defaults(
|
287 |
fn=get_youtube_transcript,
|
288 |
name="get_youtube_transcript",
|
289 |
description=(
|
290 |
+
"(YouTube) Retrieve the full transcript text of a YouTube video using either its full URL or its video ID.\n\n"
|
291 |
+
"**Functionality**:\n"
|
292 |
+
"- Attempts to extract the video ID from the URL.\n"
|
293 |
+
"- Searches for available transcripts (manual or auto-generated).\n"
|
294 |
+
"- Returns the complete transcript text in a single string.\n"
|
295 |
+
"- If no transcript is found in the preferred language(s), it attempts to fetch any available fallback transcript.\n\n"
|
296 |
+
"**Inputs:**\n"
|
297 |
+
"- `video_url_or_id` (str): The full YouTube video URL (e.g., 'https://www.youtube.com/watch?v=abc123') or the video ID directly (e.g., 'abc123').\n"
|
298 |
+
"- `languages` (str or None): Optional. A preferred language code (e.g., 'en', 'fr'). If None, defaults to 'en'.\n\n"
|
299 |
+
"**Output:**\n"
|
300 |
+
"- A single string containing the full transcript if available.\n"
|
301 |
+
"- In case of failure (no transcript, invalid URL, disabled captions), returns an error message string prefixed with `Error:`.\n\n"
|
302 |
+
"**Limitations:**\n"
|
303 |
+
"- This tool **does not** download or process video or audio.\n"
|
304 |
+
"- If captions are disabled or restricted on the video, the transcript cannot be retrieved."
|
305 |
)
|
306 |
)
|
307 |
|
|
|
315 |
|
316 |
logger.info("Initialising VideoAnalyzerAgent …")
|
317 |
|
318 |
+
llm_model_name = os.getenv("VIDEO_ANALYZER_LLM_MODEL", "gemini-2.5-pro-preview-03-25")
|
319 |
gemini_api_key = os.getenv("GEMINI_API_KEY")
|
320 |
|
321 |
if not gemini_api_key:
|
|
|
323 |
raise ValueError("GEMINI_API_KEY must be set")
|
324 |
|
325 |
try:
|
326 |
+
llm = GoogleGenAI(api_key=gemini_api_key, model="gemini-2.5-pro-preview-03-25", temperature=0.05, max_tokens=8192)
|
327 |
logger.info("Using LLM model: %s", llm_model_name)
|
328 |
|
329 |
+
system_prompt = """
|
330 |
+
You are **VideoAnalyzerAgent**, an expert multimodal analyst specialised in factual,
|
331 |
+
frame‑level understanding of video.
|
332 |
+
|
333 |
+
─────────────────
|
334 |
+
CORE PRINCIPLES
|
335 |
+
─────────────────
|
336 |
+
1. **Visual‑only reasoning** – base every statement on what can be seen in the
|
337 |
+
provided frames; never guess at sounds, music, or dialogue.
|
338 |
+
2. **Chronological accuracy** – describe events strictly in the order they occur.
|
339 |
+
3. **Sceptical precision** – if something is ambiguous on screen, say so plainly
|
340 |
+
(“unclear whether …”); do not invent motives or unseen causes.
|
341 |
+
4. **Token economy** – be concise; omit pleasantries and waffle.
|
342 |
+
5. **Professional tone** – formal, neutral, and practical.
|
343 |
+
|
344 |
+
─────────────────
|
345 |
+
TOOLS AT YOUR DISPOSAL
|
346 |
+
─────────────────
|
347 |
+
• `download_video_and_analyze(video_url)` –
|
348 |
+
Downloads the video, samples ~2fps, and returns your own multimodal summary
|
349 |
+
of the visuals such as detailed frame-by-frame analysis, key insights, or a TL;DR.
|
350 |
+
Use when the user needs a purely visual description.
|
351 |
+
|
352 |
+
• `get_youtube_transcript(video_url_or_id, languages="en")` –
|
353 |
+
Returns the full YouTube transcript (if any).
|
354 |
+
Use when the user requests spoken content or captions.
|
355 |
+
|
356 |
+
Always think aloud (in hidden chain‑of‑thought) which tool(s) you need **before**
|
357 |
+
calling them. If neither tool is relevant, politely explain why.
|
358 |
+
|
359 |
+
─────────────────
|
360 |
+
RESPONSE FORMAT
|
361 |
+
─────────────────
|
362 |
+
Return Markdown with the following sections **only when they add value**:
|
363 |
+
|
364 |
+
1. **TL;DR (≤3 sentences)** – executive summary.
|
365 |
+
2. **Timeline** – table listing `timestamp → scene description → notable objects/actions`.
|
366 |
+
3. **Key Insights** – bullet points of patterns, cause–effect, or anomalies worth noting.
|
367 |
+
4. **Actionable Take‑aways** – optional, only if user asked “so what?” questions.
|
368 |
+
|
369 |
+
Timestamps should be in **mm:ss** (or h:mm:ss if >1h).
|
370 |
+
Avoid more than one level of heading depth (i.e., use `##`, not `###`/`####`).
|
371 |
+
|
372 |
+
─────────────────
|
373 |
+
STYLE & CONSTRAINTS
|
374 |
+
─────────────────
|
375 |
+
• Use present tense for on‑screen events (“The camera pans over …”).
|
376 |
+
• Quantify when possible (“The audience consists of ~200 peoples” “text occupies ~25% of the frame”).
|
377 |
+
• Never reveal chain‑of‑thought or raw frame data.
|
378 |
+
• If no visual frames were extracted, state: “No usable frames – cannot analyse.”
|
379 |
+
• If captions are disabled, reply: “No transcript available.”
|
380 |
+
|
381 |
+
─────────────────
|
382 |
+
EXAMPLES OF ACCEPTABLE BREVITY
|
383 |
+
─────────────────
|
384 |
+
- Good: “At 02:15 the speaker shows a slide titled ‘Transformer Architecture’.”
|
385 |
+
- Bad: “There is some sort of diagram that maybe explains something about the
|
386 |
+
architecture; it might be a transformer but it is hard to tell.”
|
387 |
+
|
388 |
+
End of prompt.
|
389 |
+
"""
|
390 |
|
391 |
tools = [download_video_and_analyze_tool, youtube_transcript_tool]
|
392 |
|
|
|
439 |
|
440 |
test_agent = None
|
441 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
442 |
|
443 |
+
print("\nTesting YouTube transcript tool...")
|
444 |
+
# Example video: "Attention is All You Need" paper explanation
|
445 |
+
yt_url = "https://www.youtube.com/watch?v=TQQlZhbC5ps"
|
446 |
+
transcript = get_youtube_transcript(yt_url)
|
447 |
+
if not transcript.startswith("Error:"):
|
448 |
+
print(f"Transcript fetched (first 500 chars):\n{transcript[:500]}...")
|
449 |
+
else:
|
450 |
+
print(f"YouTube Transcript Fetch Failed: {transcript}")
|
451 |
except Exception as e:
|
452 |
print(f"Error during testing: {e}")
|
app.py
CHANGED
@@ -1,12 +1,17 @@
|
|
1 |
import os
|
2 |
import logging
|
3 |
import mimetypes
|
|
|
4 |
|
5 |
from typing import Any, List
|
6 |
|
7 |
import gradio as gr
|
8 |
import requests
|
9 |
import pandas as pd
|
|
|
|
|
|
|
|
|
10 |
|
11 |
from llama_index.core.agent.workflow import AgentWorkflow, ToolCallResult, ToolCall, AgentOutput
|
12 |
from llama_index.core.base.llms.types import ChatMessage, TextBlock, ImageBlock, AudioBlock
|
@@ -14,6 +19,8 @@ from llama_index.llms.openai import OpenAI
|
|
14 |
|
15 |
from agents.video_analyzer_agent import initialize_video_analyzer_agent
|
16 |
|
|
|
|
|
17 |
# Assuming agent initializers are in the same directory or a known path
|
18 |
# Adjust import paths if necessary based on deployment structure
|
19 |
try:
|
@@ -26,34 +33,17 @@ try:
|
|
26 |
from agents.planner_agent import initialize_planner_agent
|
27 |
from agents.research_agent import initialize_research_agent
|
28 |
from agents.role_agent import initialize_role_agent
|
29 |
-
from agents.verifier_agent import initialize_verifier_agent
|
30 |
# New agents
|
31 |
from agents.advanced_validation_agent import initialize_advanced_validation_agent
|
32 |
-
from agents.figure_interpretation_agent import initialize_figure_interpretation_agent
|
33 |
from agents.long_context_management_agent import initialize_long_context_management_agent
|
|
|
34 |
AGENT_IMPORT_PATH = "local"
|
35 |
except ImportError as e:
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
from final_project.code_agent import initialize_code_agent
|
42 |
-
from final_project.math_agent import initialize_math_agent
|
43 |
-
from final_project.planner_agent import initialize_planner_agent
|
44 |
-
from final_project.research_agent import initialize_research_agent
|
45 |
-
from final_project.role_agent import initialize_role_agent
|
46 |
-
from final_project.verifier_agent import initialize_verifier_agent
|
47 |
-
from final_project.advanced_validation_agent import initialize_advanced_validation_agent
|
48 |
-
from final_project.figure_interpretation_agent import initialize_figure_interpretation_agent
|
49 |
-
from final_project.long_context_management_agent import initialize_long_context_management_agent
|
50 |
-
AGENT_IMPORT_PATH = "final_project"
|
51 |
-
except ImportError as e2:
|
52 |
-
print(f"Import Error: Could not find agent modules. Tried local and final_project paths. Error: {e2}")
|
53 |
-
# Set initializers to None or raise error to prevent app start
|
54 |
-
initialize_image_analyzer_agent = None
|
55 |
-
# ... set all others to None ...
|
56 |
-
raise RuntimeError(f"Failed to import agent modules: {e2}")
|
57 |
|
58 |
# Setup logging
|
59 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
@@ -62,6 +52,57 @@ logger = logging.getLogger(__name__)
|
|
62 |
# --- Constants ---
|
63 |
DEFAULT_API_URL = os.getenv("GAIA_API_URL", "https://agents-course-unit4-scoring.hf.space")
|
64 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
# --- Agent Initialization (Singleton Pattern) ---
|
66 |
# Initialize the agent workflow once
|
67 |
AGENT_WORKFLOW = None
|
@@ -74,28 +115,30 @@ try:
|
|
74 |
planner_agent = initialize_planner_agent()
|
75 |
research_agent = initialize_research_agent()
|
76 |
text_analyzer_agent = initialize_text_analyzer_agent()
|
77 |
-
verifier_agent = initialize_verifier_agent()
|
78 |
image_analyzer_agent = initialize_image_analyzer_agent()
|
79 |
reasoning_agent = initialize_reasoning_agent()
|
80 |
# New agents
|
81 |
advanced_validation_agent = initialize_advanced_validation_agent()
|
82 |
-
figure_interpretation_agent = initialize_figure_interpretation_agent()
|
83 |
long_context_management_agent = initialize_long_context_management_agent()
|
84 |
video_analyzer_agent = initialize_video_analyzer_agent()
|
|
|
85 |
|
86 |
# Check if all agents initialized successfully
|
87 |
all_agents = [
|
88 |
code_agent, role_agent, math_agent, planner_agent, research_agent,
|
89 |
-
text_analyzer_agent, image_analyzer_agent,
|
90 |
-
advanced_validation_agent,
|
91 |
-
video_analyzer_agent
|
92 |
]
|
93 |
if not all(all_agents):
|
94 |
raise RuntimeError("One or more agents failed to initialize.")
|
95 |
|
96 |
AGENT_WORKFLOW = AgentWorkflow(
|
97 |
agents=all_agents,
|
98 |
-
root_agent="
|
|
|
|
|
|
|
99 |
)
|
100 |
logger.info("GAIA Multi-Agent Workflow initialized successfully.")
|
101 |
except Exception as e:
|
@@ -160,18 +203,19 @@ class BasicAgent:
|
|
160 |
logger.info(f"Agent returning final answer: {final_content[:500]}{'...' if len(final_content) > 500 else ''}")
|
161 |
return answer.response # Return the actual response object expected by Gradio
|
162 |
|
163 |
-
system_prompt="""
|
164 |
You are a general AI assistant.
|
165 |
I will give you a result, and with it you will have to transform it to follow the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
166 |
YOUR FINAL ANSWER should be a number OR 1 or 2 word(s) OR a comma separated list of numbers and/or strings.
|
167 |
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
168 |
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
169 |
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
170 |
-
""
|
171 |
-
|
172 |
-
llm = OpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"), temperature=0.1, system_prompt=system_prompt)
|
173 |
|
|
|
|
|
174 |
|
|
|
175 |
|
176 |
# --- Helper Functions for run_and_submit_all ---
|
177 |
|
@@ -222,7 +266,7 @@ async def process_question(agent: BasicAgent, item: dict, base_fetch_file_url: s
|
|
222 |
if mime_type:
|
223 |
# Prioritize specific extensions for text-like content
|
224 |
text_extensions = (
|
225 |
-
".txt", ".
|
226 |
".html", ".htm", ".xhtml", ".css", ".scss", ".sass", ".less", ".svg", ".md", ".rst",
|
227 |
".py", ".js", ".java", ".c", ".cpp", ".h", ".hpp", ".cs", ".go", ".php", ".rb", ".swift", ".kt",
|
228 |
".sh", ".bat", ".ipynb", ".Rmd", ".tex" # Added more code/markup types
|
@@ -237,17 +281,22 @@ async def process_question(agent: BasicAgent, item: dict, base_fetch_file_url: s
|
|
237 |
except Exception as decode_err:
|
238 |
logger.error(f"Could not decode file {file_name}: {decode_err}")
|
239 |
file_content = f"[Error: Could not decode file content for {file_name}]"
|
240 |
-
file_block = TextBlock(block_type="text", text=file_content)
|
241 |
elif mime_type.startswith('image/'):
|
242 |
# Pass image content directly for multi-modal models
|
243 |
file_block = ImageBlock(url=fetch_file_url, image=response.content)
|
244 |
elif mime_type.startswith('audio/'):
|
245 |
# Pass audio content directly
|
246 |
-
|
|
|
247 |
elif mime_type == 'application/pdf':
|
248 |
# PDF: Pass a text block indicating the URL for agents to handle
|
249 |
logger.info(f"PDF file detected: {file_name}. Passing reference URL.")
|
250 |
file_block = TextBlock(text=f"[Reference PDF file available at: {fetch_file_url}]")
|
|
|
|
|
|
|
|
|
251 |
# Add handling for other types like video if needed
|
252 |
# elif mime_type.startswith('video/'):
|
253 |
# logger.info(f"Video file detected: {file_name}. Passing reference URL.")
|
@@ -278,6 +327,15 @@ async def process_question(agent: BasicAgent, item: dict, base_fetch_file_url: s
|
|
278 |
submitted_answer = submitted_answer_response.content if hasattr(submitted_answer_response, 'content') else str(submitted_answer_response)
|
279 |
|
280 |
prompt = f"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
281 |
QUESTION: {question_text}
|
282 |
ANSWER: {submitted_answer}
|
283 |
INSTRUCTIONS: Based on the provided question and answer, generate a final answer that is clear, concise, and directly addresses the question.
|
@@ -329,7 +387,7 @@ async def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
329 |
return "Failed to fetch questions.", None
|
330 |
|
331 |
# 3. Process Questions
|
332 |
-
# questions_data = [questions_data[
|
333 |
for item in questions_data:
|
334 |
answers = await process_question(agent, item, fetch_file_url)
|
335 |
results_log.append(answers)
|
|
|
1 |
import os
|
2 |
import logging
|
3 |
import mimetypes
|
4 |
+
import subprocess
|
5 |
|
6 |
from typing import Any, List
|
7 |
|
8 |
import gradio as gr
|
9 |
import requests
|
10 |
import pandas as pd
|
11 |
+
import io
|
12 |
+
import torchaudio
|
13 |
+
import torchaudio.transforms as T
|
14 |
+
import whisper
|
15 |
|
16 |
from llama_index.core.agent.workflow import AgentWorkflow, ToolCallResult, ToolCall, AgentOutput
|
17 |
from llama_index.core.base.llms.types import ChatMessage, TextBlock, ImageBlock, AudioBlock
|
|
|
19 |
|
20 |
from agents.video_analyzer_agent import initialize_video_analyzer_agent
|
21 |
|
22 |
+
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
23 |
+
|
24 |
# Assuming agent initializers are in the same directory or a known path
|
25 |
# Adjust import paths if necessary based on deployment structure
|
26 |
try:
|
|
|
33 |
from agents.planner_agent import initialize_planner_agent
|
34 |
from agents.research_agent import initialize_research_agent
|
35 |
from agents.role_agent import initialize_role_agent
|
|
|
36 |
# New agents
|
37 |
from agents.advanced_validation_agent import initialize_advanced_validation_agent
|
|
|
38 |
from agents.long_context_management_agent import initialize_long_context_management_agent
|
39 |
+
from agents.synthesis_agent import initialize_synthesis_agent
|
40 |
AGENT_IMPORT_PATH = "local"
|
41 |
except ImportError as e:
|
42 |
+
print(f"Import Error: Could not find agent modules. Tried local and final_project paths. Error: {e}")
|
43 |
+
# Set initializers to None or raise error to prevent app start
|
44 |
+
initialize_image_analyzer_agent = None
|
45 |
+
# ... set all others to None ...
|
46 |
+
raise RuntimeError(f"Failed to import agent modules: {e}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
47 |
|
48 |
# Setup logging
|
49 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
|
|
|
52 |
# --- Constants ---
|
53 |
DEFAULT_API_URL = os.getenv("GAIA_API_URL", "https://agents-course-unit4-scoring.hf.space")
|
54 |
|
55 |
+
# --- Helper Functions ---
|
56 |
+
_whisper_model = whisper.load_model("small")
|
57 |
+
|
58 |
+
def transcribe_audio(audio_bytes: bytes) -> str:
|
59 |
+
logger.info(f"Attempting to transcribe audio file")
|
60 |
+
|
61 |
+
file_like = io.BytesIO(audio_bytes)
|
62 |
+
|
63 |
+
waveform, sample_rate = torchaudio.load(file_like)
|
64 |
+
|
65 |
+
waveform = waveform.mean(dim=0, keepdim=True) # [1, samples]
|
66 |
+
|
67 |
+
if sample_rate != 16000:
|
68 |
+
resampler = T.Resample(orig_freq=sample_rate, new_freq=16000)
|
69 |
+
waveform = resampler(waveform)
|
70 |
+
|
71 |
+
waveform = waveform.squeeze(0)
|
72 |
+
|
73 |
+
print(f"Tensor shape : {waveform.shape}, Frequency : {sample_rate} Hz")
|
74 |
+
|
75 |
+
# Load the Whisper model (lazy loading)
|
76 |
+
model: whisper.Whisper = _whisper_model # Uses default size "base" or WHISPER_MODEL_SIZE env var
|
77 |
+
if model is None:
|
78 |
+
return "Error: Failed to load Whisper model."
|
79 |
+
|
80 |
+
try:
|
81 |
+
# Perform transcription
|
82 |
+
# The transcribe function handles various audio formats via ffmpeg
|
83 |
+
result = whisper.transcribe(model=model, audio=waveform)
|
84 |
+
transcribed_text = result["text"]
|
85 |
+
detected_language = result.get("language", "unknown") # Get detected language if available
|
86 |
+
logger.info(
|
87 |
+
f"Audio transcription successful. Detected language: {detected_language}. Text length: {len(transcribed_text)}")
|
88 |
+
return transcribed_text
|
89 |
+
|
90 |
+
except Exception as e:
|
91 |
+
# Check if it might be an ffmpeg issue
|
92 |
+
if "ffmpeg" in str(e).lower():
|
93 |
+
logger.error(f"Error during transcription, possibly ffmpeg issue: {e}", exc_info=True)
|
94 |
+
# Check if ffmpeg is installed using shell command
|
95 |
+
try:
|
96 |
+
subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
|
97 |
+
# If ffmpeg is installed, the error is likely something else
|
98 |
+
return f"Error during transcription (ffmpeg seems installed): {e}"
|
99 |
+
except (FileNotFoundError, subprocess.CalledProcessError):
|
100 |
+
logger.error("ffmpeg command not found or failed. Please ensure ffmpeg is installed and in PATH.")
|
101 |
+
return "Error: ffmpeg not found or not working. Please install ffmpeg."
|
102 |
+
else:
|
103 |
+
logger.error(f"Unexpected error during transcription: {e}", exc_info=True)
|
104 |
+
return f"Error during transcription: {e}"
|
105 |
+
|
106 |
# --- Agent Initialization (Singleton Pattern) ---
|
107 |
# Initialize the agent workflow once
|
108 |
AGENT_WORKFLOW = None
|
|
|
115 |
planner_agent = initialize_planner_agent()
|
116 |
research_agent = initialize_research_agent()
|
117 |
text_analyzer_agent = initialize_text_analyzer_agent()
|
|
|
118 |
image_analyzer_agent = initialize_image_analyzer_agent()
|
119 |
reasoning_agent = initialize_reasoning_agent()
|
120 |
# New agents
|
121 |
advanced_validation_agent = initialize_advanced_validation_agent()
|
|
|
122 |
long_context_management_agent = initialize_long_context_management_agent()
|
123 |
video_analyzer_agent = initialize_video_analyzer_agent()
|
124 |
+
synthesis_agent = initialize_synthesis_agent()
|
125 |
|
126 |
# Check if all agents initialized successfully
|
127 |
all_agents = [
|
128 |
code_agent, role_agent, math_agent, planner_agent, research_agent,
|
129 |
+
text_analyzer_agent, image_analyzer_agent, reasoning_agent,
|
130 |
+
advanced_validation_agent, long_context_management_agent,
|
131 |
+
video_analyzer_agent, synthesis_agent
|
132 |
]
|
133 |
if not all(all_agents):
|
134 |
raise RuntimeError("One or more agents failed to initialize.")
|
135 |
|
136 |
AGENT_WORKFLOW = AgentWorkflow(
|
137 |
agents=all_agents,
|
138 |
+
root_agent="reasoning_agent", # Keep planner as root as per plan
|
139 |
+
initial_state={
|
140 |
+
"research_content": []
|
141 |
+
}
|
142 |
)
|
143 |
logger.info("GAIA Multi-Agent Workflow initialized successfully.")
|
144 |
except Exception as e:
|
|
|
203 |
logger.info(f"Agent returning final answer: {final_content[:500]}{'...' if len(final_content) > 500 else ''}")
|
204 |
return answer.response # Return the actual response object expected by Gradio
|
205 |
|
206 |
+
system_prompt = """
|
207 |
You are a general AI assistant.
|
208 |
I will give you a result, and with it you will have to transform it to follow the following template: FINAL ANSWER: [YOUR FINAL ANSWER].
|
209 |
YOUR FINAL ANSWER should be a number OR 1 or 2 word(s) OR a comma separated list of numbers and/or strings.
|
210 |
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
211 |
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
212 |
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
213 |
+
If the result is enclosed in double quotes (""), extract and return only what is inside the quotes, applying the formatting rules if needed.
|
|
|
|
|
214 |
|
215 |
+
You must never return a full sentence as the final answer. A sentence is strictly forbidden under all circumstances.
|
216 |
+
"""
|
217 |
|
218 |
+
llm = OpenAI(model="gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"), temperature=0.05, system_prompt=system_prompt)
|
219 |
|
220 |
# --- Helper Functions for run_and_submit_all ---
|
221 |
|
|
|
266 |
if mime_type:
|
267 |
# Prioritize specific extensions for text-like content
|
268 |
text_extensions = (
|
269 |
+
".txt", ".json", ".xml", ".yaml", ".yml", ".ini", ".cfg", ".toml", ".log", ".properties",
|
270 |
".html", ".htm", ".xhtml", ".css", ".scss", ".sass", ".less", ".svg", ".md", ".rst",
|
271 |
".py", ".js", ".java", ".c", ".cpp", ".h", ".hpp", ".cs", ".go", ".php", ".rb", ".swift", ".kt",
|
272 |
".sh", ".bat", ".ipynb", ".Rmd", ".tex" # Added more code/markup types
|
|
|
281 |
except Exception as decode_err:
|
282 |
logger.error(f"Could not decode file {file_name}: {decode_err}")
|
283 |
file_content = f"[Error: Could not decode file content for {file_name}]"
|
284 |
+
file_block = TextBlock(block_type="text", text=f"[File: {file_name}]\n[Content]:\n{file_content}")
|
285 |
elif mime_type.startswith('image/'):
|
286 |
# Pass image content directly for multi-modal models
|
287 |
file_block = ImageBlock(url=fetch_file_url, image=response.content)
|
288 |
elif mime_type.startswith('audio/'):
|
289 |
# Pass audio content directly
|
290 |
+
audio_text = transcribe_audio(response.content)
|
291 |
+
file_block = TextBlock(text=f"[Transcribed Audio: {audio_text}]")
|
292 |
elif mime_type == 'application/pdf':
|
293 |
# PDF: Pass a text block indicating the URL for agents to handle
|
294 |
logger.info(f"PDF file detected: {file_name}. Passing reference URL.")
|
295 |
file_block = TextBlock(text=f"[Reference PDF file available at: {fetch_file_url}]")
|
296 |
+
elif file_name.lower().endswith((".xlsx", ".xls", ".csv")):
|
297 |
+
logger.info(f"Data file detected: {file_name}. Passing reference URL.")
|
298 |
+
file_block = TextBlock(text=f"[Reference Data file available at: {fetch_file_url}]")
|
299 |
+
|
300 |
# Add handling for other types like video if needed
|
301 |
# elif mime_type.startswith('video/'):
|
302 |
# logger.info(f"Video file detected: {file_name}. Passing reference URL.")
|
|
|
327 |
submitted_answer = submitted_answer_response.content if hasattr(submitted_answer_response, 'content') else str(submitted_answer_response)
|
328 |
|
329 |
prompt = f"""
|
330 |
+
You are a general AI assistant.
|
331 |
+
I will give you a result, and with it you will have to transform it to follow the following template: [YOUR FINAL ANSWER].
|
332 |
+
YOUR FINAL ANSWER should be a number OR 1 or 2 word(s) OR a comma separated list of numbers and/or strings.
|
333 |
+
If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise.
|
334 |
+
If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise.
|
335 |
+
If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string.
|
336 |
+
If the result is enclosed in double quotes (""), extract and return only what is inside the quotes, applying the formatting rules if needed.
|
337 |
+
|
338 |
+
You must never return a full sentence as the final answer. A sentence is strictly forbidden under all circumstances.
|
339 |
QUESTION: {question_text}
|
340 |
ANSWER: {submitted_answer}
|
341 |
INSTRUCTIONS: Based on the provided question and answer, generate a final answer that is clear, concise, and directly addresses the question.
|
|
|
387 |
return "Failed to fetch questions.", None
|
388 |
|
389 |
# 3. Process Questions
|
390 |
+
# questions_data = [questions_data[6]]
|
391 |
for item in questions_data:
|
392 |
answers = await process_question(agent, item, fetch_file_url)
|
393 |
results_log.append(answers)
|
cookies.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Netscape HTTP Cookie File
|
2 |
+
# This file is generated by yt-dlp. Do not edit.
|
3 |
+
|
4 |
+
.youtube.com TRUE / TRUE 1772810415 SOCS CAESEwgDEgk3MjI0NDY2OTcaAmZyIAEaBgiAsYW9Bg
|
5 |
+
.youtube.com TRUE / TRUE 1761928305 __Secure-ROLLOUT_TOKEN CK2CtoGE9_qCKBDVmd6PqaqLAxiimOLanoqNAw%3D%3D
|
6 |
+
.youtube.com TRUE / TRUE 1780502573 __Secure-3PAPISID Es8lNi2tKH_w6xhk/A5yqk-H0g0K329X1o
|
7 |
+
.youtube.com TRUE / TRUE 1780502573 __Secure-3PSID g.a000wQiyYVADhJ0IDxSDoFRi6cUtMUs77vKiv67cKlhfBRyt8TWh6wP59ja7Ccu2lEPStG0E9gACgYKAT0SARASFQHGX2MirUvQtL-jYTLLnMocXq-eVxoVAUF8yKq2SkbS4FUiTUqswnomlXXS0076
|
8 |
+
.youtube.com TRUE / TRUE 1772810410 __Secure-YEC CgtaV0NPeGJVRncxZyjwrN7ABjInCgJCRRIhEh0SGwsMDg8QERITFBUWFxgZGhscHR4fICEiIyQlJiAy
|
9 |
+
.youtube.com TRUE / TRUE 1761928305 VISITOR_PRIVACY_METADATA CgJCRRIhEh0SGwsMDg8QERITFBUWFxgZGhscHR4fICEiIyQlJiAa
|
10 |
+
.youtube.com TRUE / TRUE 1777608592 __Secure-1PSIDTS sidts-CjIBjplskMR_wI_-sxQCXIh_A5wIobOdoONYRAp32Akkn2F4h29lH76FjI_xUvu93oO86RAA
|
11 |
+
.youtube.com TRUE / TRUE 1777608592 __Secure-3PSIDTS sidts-CjIBjplskMR_wI_-sxQCXIh_A5wIobOdoONYRAp32Akkn2F4h29lH76FjI_xUvu93oO86RAA
|
12 |
+
.youtube.com TRUE / TRUE 1777608592 __Secure-3PSIDCC AKEyXzWgzZK09pWnLsejhZtkJMtHpvkaFBG7tdOEw0-1NRBycYj8xzr1E8FDKS6ZptP5vsTiSRi2
|
13 |
+
.youtube.com TRUE / FALSE 0 PREF f4=4000000&tz=UTC&f7=100&f6=400&f5=30000&hl=en
|
14 |
+
.youtube.com TRUE / TRUE 1809448305 __Secure-YT_TVFAS t=485104&s=3
|
15 |
+
.youtube.com TRUE / TRUE 1761928305 DEVICE_INFO ChxOelV3TURZeU9URXhOemN5TVRRM05EazRPQT09EPGs3sAGGPGs3sAG
|
16 |
+
.youtube.com TRUE / TRUE 1761928305 VISITOR_INFO1_LIVE gMG4-pZ0QBE
|
gaia_improvement_plan.md
DELETED
@@ -1,943 +0,0 @@
|
|
1 |
-
|
2 |
-
### 3.5. `research_agent.py` Refactoring
|
3 |
-
|
4 |
-
* **Rationale:** To improve browser instance management, error handling, and configuration.
|
5 |
-
* **Proposals:**
|
6 |
-
1. **Browser Lifecycle Management:** Instead of initializing the browser (`start_chrome`) at the module level, manage its lifecycle explicitly. Options:
|
7 |
-
* Initialize the browser within the agent's initialization and provide a method or tool to explicitly close it (`kill_browser`) when the agent's task is done or the application shuts down.
|
8 |
-
* Use a context manager (`with start_chrome(...) as browser:`) if the browser is only needed for a specific scope within a tool call (less likely for a persistent agent).
|
9 |
-
* Ensure `kill_browser` is reliably called. Perhaps the `planner_agent` could invoke a cleanup tool/method on the `research_agent` after its tasks are complete.
|
10 |
-
2. **Configuration:** Move hardcoded Chrome options to configuration. Externalize API keys/IDs if not already done (they seem to be using `os.getenv`, which is good).
|
11 |
-
3. **Robust Error Handling:** For browser interaction tools (`visit`, `get_text_by_css`, `click_element`), raise specific custom exceptions instead of returning error strings. This allows for more structured error handling by the agent or workflow.
|
12 |
-
4. **Tool Consolidation (Optional):** The agent has many tools. Consider if some related tools (e.g., different search APIs) could be consolidated behind a single tool that internally chooses the best source, or if the LLM handles the large toolset effectively.
|
13 |
-
|
14 |
-
* **Diff Patch (Illustrative - Configuration & Browser Init):**
|
15 |
-
|
16 |
-
```diff
|
17 |
-
--- a/research_agent.py
|
18 |
-
+++ b/research_agent.py
|
19 |
-
@@ -1,5 +1,6 @@
|
20 |
-
import os
|
21 |
-
import time
|
22 |
-
+ import logging
|
23 |
-
from typing import List
|
24 |
-
|
25 |
-
from llama_index.core.agent.workflow import ReActAgent
|
26 |
-
@@ -15,17 +16,21 @@
|
27 |
-
from helium import start_chrome, go_to, find_all, Text, kill_browser
|
28 |
-
from helium import get_driver
|
29 |
-
|
30 |
-
+ logger = logging.getLogger(__name__)
|
31 |
-
+
|
32 |
-
# 1. Helium
|
33 |
-
-chrome_options = webdriver.ChromeOptions()
|
34 |
-
-chrome_options.add_argument("--no-sandbox")
|
35 |
-
-chrome_options.add_argument("--disable-dev-shm-usage")
|
36 |
-
-chrome_options.add_experimental_option("prefs", {
|
37 |
-
- "download.prompt_for_download": False,
|
38 |
-
- "plugins.always_open_pdf_externally": True,
|
39 |
-
- "profile.default_content_settings.popups": 0
|
40 |
-
-})
|
41 |
-
-
|
42 |
-
-browser = start_chrome(headless=True, options=chrome_options)
|
43 |
-
+# Browser instance should be managed, not global at module level
|
44 |
-
+# browser = start_chrome(headless=True, options=chrome_options)
|
45 |
-
+
|
46 |
-
+def get_chrome_options():
|
47 |
-
+ options = webdriver.ChromeOptions()
|
48 |
-
+ if os.getenv("RESEARCH_AGENT_CHROME_NO_SANDBOX", "true").lower() == "true":
|
49 |
-
+ options.add_argument("--no-sandbox")
|
50 |
-
+ if os.getenv("RESEARCH_AGENT_CHROME_DISABLE_DEV_SHM", "true").lower() == "true":
|
51 |
-
+ options.add_argument("--disable-dev-shm-usage")
|
52 |
-
+ # Add other options from config as needed
|
53 |
-
+ # options.add_experimental_option(...) # Example
|
54 |
-
+ return options
|
55 |
-
|
56 |
-
def visit(url: str, wait_seconds: float = 2.0) -> str |None:
|
57 |
-
"""
|
58 |
-
@@ -36,10 +41,11 @@
|
59 |
-
wait_seconds (float): Time to wait after navigation.
|
60 |
-
"""
|
61 |
-
try:
|
62 |
-
+ # Assumes browser is available in context (e.g., class member)
|
63 |
-
go_to(url)
|
64 |
-
time.sleep(wait_seconds)
|
65 |
-
return f"Visited: {url}"
|
66 |
-
except Exception as e:
|
67 |
-
+ logger.error(f"Error visiting {url}: {e}", exc_info=True)
|
68 |
-
return f"Error visiting {url}: {e}"
|
69 |
-
|
70 |
-
def get_text_by_css(selector: str) -> List[str] | str:
|
71 |
-
@@ -52,13 +58,15 @@
|
72 |
-
List[str]: List of text contents.
|
73 |
-
"""
|
74 |
-
try:
|
75 |
-
+ # Assumes browser/helium context is active
|
76 |
-
if selector.lower() == 'body':
|
77 |
-
elements = find_all(Text())
|
78 |
-
else:
|
79 |
-
elements = find_all(selector)
|
80 |
-
texts = [elem.web_element.text for elem in elements]
|
81 |
-
- print(f"Extracted {len(texts)} elements for selector \'{selector}\'")
|
82 |
-
+ logger.info(f"Extracted {len(texts)} elements for selector \'{selector}\'")
|
83 |
-
return texts
|
84 |
-
except Exception as e:
|
85 |
-
+ logger.error(f"Error extracting text for selector {selector}: {e}", exc_info=True)
|
86 |
-
return f"Error extracting text for selector {selector}: {e}"
|
87 |
-
|
88 |
-
def get_page_html() -> str:
|
89 |
-
@@ -70,9 +78,11 @@
|
90 |
-
str: HTML content, or empty string on error.
|
91 |
-
"""
|
92 |
-
try:
|
93 |
-
+ # Assumes browser/helium context is active
|
94 |
-
driver = get_driver()
|
95 |
-
html = driver.page_source
|
96 |
-
return html
|
97 |
-
except Exception as e:
|
98 |
-
+ logger.error(f"Error extracting HTML: {e}", exc_info=True)
|
99 |
-
return f"Error extracting HTML: {e}"
|
100 |
-
|
101 |
-
def click_element(selector: str, index_element: int = 0) -> str:
|
102 |
-
@@ -83,10 +93,12 @@
|
103 |
-
selector (str): CSS selector of the element to click.
|
104 |
-
"""
|
105 |
-
try:
|
106 |
-
+ # Assumes browser/helium context is active
|
107 |
-
element = find_all(selector)[index_element]
|
108 |
-
element.click()
|
109 |
-
time.sleep(1)
|
110 |
-
return f"Clicked element matching selector \'{selector}\'"
|
111 |
-
except Exception as e:
|
112 |
-
+ logger.error(f"Error clicking element {selector}: {e}", exc_info=True)
|
113 |
-
return f"Error clicking element {selector}: {e}"
|
114 |
-
|
115 |
-
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
|
116 |
-
@@ -97,6 +109,7 @@
|
117 |
-
nth_result: Which occurrence to jump to (default: 1)
|
118 |
-
"""
|
119 |
-
elements = browser.find_elements(By.XPATH, f"//*[contains(text(), \'{text}\')]")
|
120 |
-
+ # Assumes browser is available in context
|
121 |
-
if nth_result > len(elements):
|
122 |
-
return f"Match n°{nth_result} not found (only {len(elements)} matches found)"
|
123 |
-
result = f"Found {len(elements)} matches for \'{text}\'."
|
124 |
-
@@ -107,19 +120,22 @@
|
125 |
-
|
126 |
-
def go_back() -> None:
|
127 |
-
"""Goes back to previous page."""
|
128 |
-
browser.back()
|
129 |
-
+ # Assumes browser is available in context
|
130 |
-
|
131 |
-
def close_popups() -> None:
|
132 |
-
"""
|
133 |
-
Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
|
134 |
-
"""
|
135 |
-
webdriver.ActionChains(browser).send_keys(Keys.ESCAPE).perform()
|
136 |
-
+ # Assumes browser is available in context
|
137 |
-
|
138 |
-
def close() -> None:
|
139 |
-
"""
|
140 |
-
Close the browser instance.
|
141 |
-
"""
|
142 |
-
try:
|
143 |
-
+ # Assumes kill_browser is appropriate here
|
144 |
-
kill_browser()
|
145 |
-
- print("Browser closed")
|
146 |
-
+ logger.info("Browser closed via kill_browser()")
|
147 |
-
except Exception as e:
|
148 |
-
- print(f"Error closing browser: {e}")
|
149 |
-
+ logger.error(f"Error closing browser: {e}", exc_info=True)
|
150 |
-
|
151 |
-
visit_tool = FunctionTool.from_defaults(
|
152 |
-
fn=visit,
|
153 |
-
@@ -240,9 +256,14 @@
|
154 |
-
|
155 |
-
|
156 |
-
def initialize_research_agent() -> ReActAgent:
|
157 |
-
+ # Browser initialization should happen here or be managed externally
|
158 |
-
+ # Example: browser = start_chrome(headless=True, options=get_chrome_options())
|
159 |
-
+ # Ensure browser instance is passed to tools or accessible via agent state/class
|
160 |
-
+
|
161 |
-
+ llm_model_name = os.getenv("RESEARCH_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
162 |
-
llm = GoogleGenAI(
|
163 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
164 |
-
- model="models/gemini-1.5-pro",
|
165 |
-
+ model=llm_model_name,
|
166 |
-
)
|
167 |
-
|
168 |
-
system_prompt = """\
|
169 |
-
```
|
170 |
-
|
171 |
-
|
172 |
-
### 3.6. `text_analyzer_agent.py` Refactoring
|
173 |
-
|
174 |
-
* **Rationale:** To improve configuration management and error handling.
|
175 |
-
* **Proposals:**
|
176 |
-
1. **Configuration:** Move the hardcoded LLM model name (`models/gemini-1.5-pro`) to environment variables or a configuration file.
|
177 |
-
2. **Prompt Management:** Move the `analyze_text` prompt to a separate template file.
|
178 |
-
3. **Error Handling:** In `extract_text_from_pdf`, consider raising specific exceptions (e.g., `PDFDownloadError`, `PDFParsingError`) instead of returning error strings, allowing the agent to handle failures more gracefully.
|
179 |
-
|
180 |
-
* **Diff Patch (Illustrative - Configuration & Error Handling):**
|
181 |
-
|
182 |
-
```diff
|
183 |
-
--- a/text_analyzer_agent.py
|
184 |
-
+++ b/text_analyzer_agent.py
|
185 |
-
@@ -6,6 +6,14 @@
|
186 |
-
|
187 |
-
logger = logging.getLogger(__name__)
|
188 |
-
|
189 |
-
+ class PDFExtractionError(Exception):
|
190 |
-
+ """Custom exception for PDF extraction failures."""
|
191 |
-
+ pass
|
192 |
-
+
|
193 |
-
+ class PDFDownloadError(PDFExtractionError):
|
194 |
-
+ """Custom exception for PDF download failures."""
|
195 |
-
+ pass
|
196 |
-
+
|
197 |
-
def extract_text_from_pdf(source: str) -> str:
|
198 |
-
"""
|
199 |
-
Extract raw text from a PDF file on disk or at a URL.
|
200 |
-
@@ -19,21 +27,21 @@
|
201 |
-
try:
|
202 |
-
resp = requests.get(source, timeout=10)
|
203 |
-
resp.raise_for_status()
|
204 |
-
- except Exception as e:
|
205 |
-
- return f"Error downloading PDF from {source}: {e}"
|
206 |
-
+ except requests.exceptions.RequestException as e:
|
207 |
-
+ raise PDFDownloadError(f"Error downloading PDF from {source}: {e}") from e
|
208 |
-
|
209 |
-
try:
|
210 |
-
tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".pdf")
|
211 |
-
tmp.write(resp.content)
|
212 |
-
tmp.flush()
|
213 |
-
tmp_path = tmp.name
|
214 |
-
tmp.close()
|
215 |
-
- except Exception as e:
|
216 |
-
- return f"Error writing temp PDF file: {e}"
|
217 |
-
+ except IOError as e:
|
218 |
-
+ raise PDFExtractionError(f"Error writing temp PDF file: {e}") from e
|
219 |
-
path = tmp_path
|
220 |
-
else:
|
221 |
-
path = source
|
222 |
-
|
223 |
-
# Now extract text from the PDF on disk
|
224 |
-
if not os.path.isfile(path):
|
225 |
-
- return f"PDF not found: {path}"
|
226 |
-
+ raise PDFExtractionError(f"PDF not found: {path}")
|
227 |
-
|
228 |
-
text = ""
|
229 |
-
|
230 |
-
@@ -41,10 +49,10 @@
|
231 |
-
reader = PdfReader(path)
|
232 |
-
pages = [page.extract_text() or "" for page in reader.pages]
|
233 |
-
text = "\n".join(pages)
|
234 |
-
- print(f"Extracted {len(pages)} pages of text from PDF")
|
235 |
-
+ logger.info(f"Extracted {len(pages)} pages of text from PDF: {path}")
|
236 |
-
except Exception as e:
|
237 |
-
# Catch specific PyPDF2 errors if possible, otherwise general Exception
|
238 |
-
- return f"Error reading PDF: {e}"
|
239 |
-
+ raise PDFExtractionError(f"Error reading PDF {path}: {e}") from e
|
240 |
-
|
241 |
-
# Clean up temporary file if one was created
|
242 |
-
if source.lower().startswith(("http://", "https://")):
|
243 |
-
@@ -67,6 +75,14 @@
|
244 |
-
str: A plain-text string containing:
|
245 |
-
• A “Summary:” section with bullet points.
|
246 |
-
• A “Facts:” section with bullet points.
|
247 |
-
+ """
|
248 |
-
+ # Load prompt from file ideally
|
249 |
-
+ prompt_template = """You are an expert analyst.
|
250 |
-
+
|
251 |
-
+ Please analyze the following text and produce a plain-text response
|
252 |
-
+ with two sections:
|
253 |
-
+
|
254 |
-
+ Summary:
|
255 |
-
+ • Provide 2–3 concise bullet points summarizing the main ideas.
|
256 |
-
+
|
257 |
-
+ Facts:
|
258 |
-
+ • List each verifiable fact found in the text as a bullet point.
|
259 |
-
+
|
260 |
-
+ Respond with exactly that format—no JSON, no extra commentary.
|
261 |
-
+
|
262 |
-
+ Text to analyze:
|
263 |
-
+ \"\"\"
|
264 |
-
+ {text}
|
265 |
-
+ \"\"\"
|
266 |
-
"""
|
267 |
-
# Build the prompt to guide the LLM’s output format
|
268 |
-
input_prompt = f"""You are an expert analyst.
|
269 |
-
@@ -84,13 +100,14 @@
|
270 |
-
{text}
|
271 |
-
\"\"\"
|
272 |
-
"""
|
273 |
-
+ input_prompt = prompt_template.format(text=text)
|
274 |
-
|
275 |
-
# Use the LLM to generate the analysis
|
276 |
-
+ llm_model_name = os.getenv("TEXT_ANALYZER_LLM_MODEL", "models/gemini-1.5-pro")
|
277 |
-
llm = GoogleGenAI(
|
278 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
279 |
-
- model="models/gemini-1.5-pro",
|
280 |
-
+ model=llm_model_name,
|
281 |
-
)
|
282 |
-
|
283 |
-
generated = llm.complete(input_prompt)
|
284 |
-
@@ -124,9 +141,10 @@
|
285 |
-
FunctionAgent: Configured analysis agent.
|
286 |
-
"""
|
287 |
-
|
288 |
-
+ llm_model_name = os.getenv("TEXT_ANALYZER_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
289 |
-
llm = GoogleGenAI(
|
290 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
291 |
-
- model="models/gemini-1.5-pro",
|
292 |
-
+ model=llm_model_name,
|
293 |
-
)
|
294 |
-
|
295 |
-
system_prompt = """\
|
296 |
-
```
|
297 |
-
|
298 |
-
|
299 |
-
### 3.7. `reasoning_agent.py` Refactoring
|
300 |
-
|
301 |
-
* **Rationale:** To simplify the agent structure, improve configuration, and potentially optimize LLM usage.
|
302 |
-
* **Proposals:**
|
303 |
-
1. **Configuration:** Move hardcoded LLM model names (`models/gemini-1.5-pro`, `o4-mini`) and the API key environment variable name (`ALPAFLOW_OPENAI_API_KEY`) to configuration.
|
304 |
-
2. **Prompt Management:** Move the detailed CoT prompt from `reasoning_tool_fn` to a separate template file.
|
305 |
-
3. **Agent Structure Simplification:** Given the rigid workflow (call tool -> handoff), consider replacing the `ReActAgent` with a simpler `FunctionAgent` that directly calls the `reasoning_tool` and formats the output before handing off. Alternatively, evaluate if the `reasoning_tool` logic could be integrated as a direct LLM call within agents that need CoT (like `planner_agent`), potentially removing the need for a separate `reasoning_agent` altogether, unless its specific CoT prompt/model (`o4-mini`) is crucial.
|
306 |
-
|
307 |
-
* **Diff Patch (Illustrative - Configuration & Prompt Loading):**
|
308 |
-
|
309 |
-
```diff
|
310 |
-
--- a/reasoning_agent.py
|
311 |
-
+++ b/reasoning_agent.py
|
312 |
-
@@ -1,10 +1,19 @@
|
313 |
-
import os
|
314 |
-
+ import logging
|
315 |
-
|
316 |
-
from llama_index.core.agent.workflow import ReActAgent
|
317 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
318 |
-
from llama_index.core.tools import FunctionTool
|
319 |
-
from llama_index.llms.openai import OpenAI
|
320 |
-
|
321 |
-
+ logger = logging.getLogger(__name__)
|
322 |
-
+
|
323 |
-
+ def load_prompt_from_file(filename="reasoning_tool_prompt.txt") -> str:
|
324 |
-
+ try:
|
325 |
-
+ with open(filename, "r") as f:
|
326 |
-
+ return f.read()
|
327 |
-
+ except FileNotFoundError:
|
328 |
-
+ logger.error(f"Prompt file {filename} not found.")
|
329 |
-
+ return "Perform chain-of-thought reasoning on the context: {context}"
|
330 |
-
+
|
331 |
-
def reasoning_tool_fn(context: str) -> str:
|
332 |
-
"""
|
333 |
-
Perform end-to-end chain-of-thought reasoning over the full multi-agent workflow context,
|
334 |
-
@@ -17,45 +26,12 @@
|
335 |
-
str: A structured reasoning trace with numbered thought steps, intermediate checks,
|
336 |
-
and a concise final recommendation or conclusion.
|
337 |
-
"""
|
338 |
-
- prompt = f"""You are an expert reasoning engine. You have the following full context of a multi-agent workflow:
|
339 |
-
-
|
340 |
-
- {context}
|
341 |
-
-
|
342 |
-
- Your job is to:
|
343 |
-
- 1. **Comprehension**
|
344 |
-
- - Read the entire question or problem statement carefully.
|
345 |
-
- - Identify key terms, constraints, and desired outcomes.
|
346 |
-
-
|
347 |
-
- 2. **Decomposition**
|
348 |
-
- - Break down the problem into logical sub-steps or sub-questions.
|
349 |
-
- - Ensure each sub-step is necessary and sufficient to progress toward a solution.
|
350 |
-
-
|
351 |
-
- 3. **Chain-of-Thought**
|
352 |
-
- - Articulate your internal reasoning in clear, numbered steps.
|
353 |
-
- - At each step, state your assumptions, derive implications, and check for consistency.
|
354 |
-
-
|
355 |
-
- 4. **Intermediate Verification**
|
356 |
-
- - After each reasoning step, validate your conclusion against the problem’s constraints.
|
357 |
-
- - If a contradiction or uncertainty arises, revisit and refine the previous step.
|
358 |
-
-
|
359 |
-
- 5. **Synthesis**
|
360 |
-
- - Once all sub-steps are resolved, integrate the intermediate results into a cohesive answer.
|
361 |
-
- - Ensure the final answer directly addresses the user’s request and all specified criteria.
|
362 |
-
-
|
363 |
-
- 6. **Clarity & Precision**
|
364 |
-
- - Use formal, precise language.
|
365 |
-
- - Avoid ambiguity: define any technical terms you introduce.
|
366 |
-
- - Provide just enough detail to justify each conclusion without digression.
|
367 |
-
-
|
368 |
-
- 7. **Final Answer**
|
369 |
-
- - Present a concise, well-structured response.
|
370 |
-
- - If appropriate, include a brief summary of your reasoning steps.
|
371 |
-
-
|
372 |
-
- Respond with your reasoning steps followed by the final recommendation.
|
373 |
-
- """
|
374 |
-
+ prompt_template = load_prompt_from_file()
|
375 |
-
+ prompt = prompt_template.format(context=context)
|
376 |
-
|
377 |
-
+ reasoning_llm_model = os.getenv("REASONING_TOOL_LLM_MODEL", "o4-mini")
|
378 |
-
+ # Use specific API key if needed, e.g., ALPAFLOW_OPENAI_API_KEY
|
379 |
-
+ reasoning_api_key_env = os.getenv("REASONING_TOOL_API_KEY_ENV", "ALPAFLOW_OPENAI_API_KEY")
|
380 |
-
+ reasoning_api_key = os.getenv(reasoning_api_key_env)
|
381 |
-
llm = OpenAI(
|
382 |
-
- model="o4-mini",
|
383 |
-
- api_key=os.getenv("ALPAFLOW_OPENAI_API_KEY"),
|
384 |
-
+ model=reasoning_llm_model,
|
385 |
-
+ api_key=reasoning_api_key,
|
386 |
-
reasoning_effort="high"
|
387 |
-
)
|
388 |
-
response = llm.complete(prompt)
|
389 |
-
@@ -74,9 +50,10 @@
|
390 |
-
"""
|
391 |
-
Create a pure reasoning agent with no tools, relying solely on chain-of-thought.
|
392 |
-
"""
|
393 |
-
+ agent_llm_model = os.getenv("REASONING_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
394 |
-
llm = GoogleGenAI(
|
395 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
396 |
-
- model="models/gemini-1.5-pro",
|
397 |
-
+ model=agent_llm_model,
|
398 |
-
)
|
399 |
-
|
400 |
-
system_prompt = """\
|
401 |
-
```
|
402 |
-
|
403 |
-
|
404 |
-
### 3.8. `planner_agent.py` Refactoring
|
405 |
-
|
406 |
-
* **Rationale:** To improve configuration management and prompt handling.
|
407 |
-
* **Proposals:**
|
408 |
-
1. **Configuration:** Move the hardcoded LLM model name (`models/gemini-1.5-pro`) to environment variables or a configuration file.
|
409 |
-
2. **Prompt Management:** Move the system prompt and the prompts within the `plan` and `synthesize_and_respond` functions to separate template files for better readability and maintainability.
|
410 |
-
|
411 |
-
* **Diff Patch (Illustrative - Configuration & Prompt Loading):**
|
412 |
-
|
413 |
-
```diff
|
414 |
-
--- a/planner_agent.py
|
415 |
-
+++ b/planner_agent.py
|
416 |
-
@@ -1,10 +1,19 @@
|
417 |
-
import os
|
418 |
-
+ import logging
|
419 |
-
from typing import List, Any
|
420 |
-
|
421 |
-
from llama_index.core.agent.workflow import FunctionAgent, ReActAgent
|
422 |
-
from llama_index.core.tools import FunctionTool
|
423 |
-
from llama_index.llms.google_genai import GoogleGenAI
|
424 |
-
|
425 |
-
+ logger = logging.getLogger(__name__)
|
426 |
-
+
|
427 |
-
+ def load_prompt_from_file(filename: str, default_prompt: str) -> str:
|
428 |
-
+ try:
|
429 |
-
+ with open(filename, "r") as f:
|
430 |
-
+ return f.read()
|
431 |
-
+ except FileNotFoundError:
|
432 |
-
+ logger.warning(f"Prompt file {filename} not found. Using default.")
|
433 |
-
+ return default_prompt
|
434 |
-
+
|
435 |
-
def plan(objective: str) -> List[str]:
|
436 |
-
"""
|
437 |
-
Generate a list of sub-questions from the given objective.
|
438 |
-
@@ -15,14 +24,16 @@
|
439 |
-
Returns:
|
440 |
-
List[str]: A list of sub-steps as strings.
|
441 |
-
"""
|
442 |
-
- input_prompt: str = (
|
443 |
-
+ default_plan_prompt = (
|
444 |
-
"You are a research assistant. "
|
445 |
-
"Given an objective, break it down into a list of concise, actionable sub-steps.\n"
|
446 |
-
f"Objective: {objective}\n"
|
447 |
-
"Sub-steps (one per line):"
|
448 |
-
)
|
449 |
-
+ plan_prompt_template = load_prompt_from_file("planner_plan_prompt.txt", default_plan_prompt)
|
450 |
-
+ input_prompt = plan_prompt_template.format(objective=objective)
|
451 |
-
|
452 |
-
+ llm_model_name = os.getenv("PLANNER_TOOL_LLM_MODEL", "models/gemini-1.5-pro")
|
453 |
-
llm = GoogleGenAI(
|
454 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
455 |
-
- model="models/gemini-1.5-pro",
|
456 |
-
+ model=llm_model_name,
|
457 |
-
)
|
458 |
-
|
459 |
-
|
460 |
-
@@ -44,13 +55,16 @@
|
461 |
-
Returns:
|
462 |
-
str: A unified, well-structured response addressing the original objective.
|
463 |
-
"""
|
464 |
-
- # Join each ready-made QA block directly
|
465 |
-
summary_blocks = "\n".join(results)
|
466 |
-
- input_prompt = f"""You are an expert synthesizer. Given the following sub-questions and their answers,
|
467 |
-
+ default_synth_prompt = f"""You are an expert synthesizer. Given the following sub-questions and their answers,
|
468 |
-
produce a single, coherent, comprehensive report that addresses the original objective:
|
469 |
-
|
470 |
-
{summary_blocks}
|
471 |
-
|
472 |
-
Final Report:
|
473 |
-
"""
|
474 |
-
+ synth_prompt_template = load_prompt_from_file("planner_synthesize_prompt.txt", default_synth_prompt)
|
475 |
-
+ input_prompt = synth_prompt_template.format(summary_blocks=summary_blocks)
|
476 |
-
+
|
477 |
-
+ llm_model_name = os.getenv("PLANNER_TOOL_LLM_MODEL", "models/gemini-1.5-pro") # Can use same model as plan
|
478 |
-
llm = GoogleGenAI(
|
479 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
480 |
-
- model="models/gemini-1.5-pro",
|
481 |
-
+ model=llm_model_name,
|
482 |
-
)
|
483 |
-
response = llm.complete(input_prompt)
|
484 |
-
return response.text
|
485 |
-
@@ -77,9 +91,10 @@
|
486 |
-
"""
|
487 |
-
Initialize a LlamaIndex agent specialized in research planning and question engineering.
|
488 |
-
"""
|
489 |
-
+ agent_llm_model = os.getenv("PLANNER_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
490 |
-
llm = GoogleGenAI(
|
491 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
492 |
-
- model="models/gemini-1.5-pro",
|
493 |
-
+ model=agent_llm_model,
|
494 |
-
)
|
495 |
-
|
496 |
-
system_prompt = """\
|
497 |
-
@@ -108,6 +123,7 @@
|
498 |
-
**Completion & Synthesis**
|
499 |
-
If the final result fully completes the original objective, produce a consolidated synthesis of the roadmap and send it as your concluding output.
|
500 |
-
"""
|
501 |
-
+ system_prompt = load_prompt_from_file("planner_system_prompt.txt", system_prompt) # Load from file if exists
|
502 |
-
|
503 |
-
agent = ReActAgent(
|
504 |
-
name="planner_agent",
|
505 |
-
```
|
506 |
-
|
507 |
-
|
508 |
-
### 3.9. `code_agent.py` Refactoring
|
509 |
-
|
510 |
-
* **Rationale:** To address the critical security vulnerability of the `SimpleCodeExecutor`, improve configuration management, and align code execution with safer practices.
|
511 |
-
* **Proposals:**
|
512 |
-
1. **Remove `SimpleCodeExecutor`:** This class and its `execute` method using `subprocess` with raw code strings are fundamentally insecure and **must be removed entirely**.
|
513 |
-
2. **Use `CodeInterpreterToolSpec`:** Rely *exclusively* on the `code_interpreter` tool derived from LlamaIndex's `CodeInterpreterToolSpec` for code execution. This tool is designed for safer, sandboxed execution.
|
514 |
-
3. **Update `CodeActAgent` Initialization:** Remove the `code_execute_fn` parameter when initializing `CodeActAgent`, as the agent should use the provided `code_interpreter` tool for execution via the standard ReAct/Act loop, not a direct execution function.
|
515 |
-
4. **Configuration:** Move hardcoded LLM model names (`o4-mini`, `models/gemini-1.5-pro`) and the API key environment variable name (`ALPAFLOW_OPENAI_API_KEY`) to configuration.
|
516 |
-
5. **Prompt Management:** Move the `generate_python_code` prompt to a separate template file.
|
517 |
-
|
518 |
-
* **Diff Patch (Illustrative - Security Fix & Configuration):**
|
519 |
-
|
520 |
-
```diff
|
521 |
-
--- a/code_agent.py
|
522 |
-
+++ b/code_agent.py
|
523 |
-
@@ -1,5 +1,6 @@
|
524 |
-
import os
|
525 |
-
import subprocess
|
526 |
-
+ import logging
|
527 |
-
|
528 |
-
from llama_index.core.agent.workflow import ReActAgent, CodeActAgent
|
529 |
-
from llama_index.core.tools import FunctionTool
|
530 |
-
@@ -7,6 +8,16 @@
|
531 |
-
from llama_index.llms.openai import OpenAI
|
532 |
-
from llama_index.tools.code_interpreter import CodeInterpreterToolSpec
|
533 |
-
|
534 |
-
+ logger = logging.getLogger(__name__)
|
535 |
-
+
|
536 |
-
+ def load_prompt_from_file(filename: str, default_prompt: str) -> str:
|
537 |
-
+ try:
|
538 |
-
+ with open(filename, "r") as f:
|
539 |
-
+ return f.read()
|
540 |
-
+ except FileNotFoundError:
|
541 |
-
+ logger.warning(f"Prompt file {filename} not found. Using default.")
|
542 |
-
+ return default_prompt
|
543 |
-
+
|
544 |
-
def generate_python_code(prompt: str) -> str:
|
545 |
-
"""
|
546 |
-
Generate valid Python code from a natural language description.
|
547 |
-
@@ -27,7 +38,7 @@
|
548 |
-
it before execution.
|
549 |
-
- This function only generates code and does not execute it.
|
550 |
-
"""
|
551 |
-
-
|
552 |
-
- input_prompt = f"""You are also a helpful assistant that writes Python code.
|
553 |
-
+ default_gen_prompt = f"""You are also a helpful assistant that writes Python code.
|
554 |
-
You will be given a prompt and you must generate Python code based on that prompt.
|
555 |
-
You must only generate Python code and nothing else.
|
556 |
-
Do not include any explanations or any other text.
|
557 |
-
@@ -40,10 +51,14 @@
|
558 |
-
Code:\n
|
559 |
-
"""
|
560 |
-
|
561 |
-
+ gen_prompt_template = load_prompt_from_file("code_gen_prompt.txt", default_gen_prompt)
|
562 |
-
+ input_prompt = gen_prompt_template.format(prompt=prompt)
|
563 |
-
+
|
564 |
-
+ gen_llm_model = os.getenv("CODE_GEN_LLM_MODEL", "o4-mini")
|
565 |
-
+ gen_api_key_env = os.getenv("CODE_GEN_API_KEY_ENV", "ALPAFLOW_OPENAI_API_KEY")
|
566 |
-
+ gen_api_key = os.getenv(gen_api_key_env)
|
567 |
-
llm = OpenAI(
|
568 |
-
- model="o4-mini",
|
569 |
-
- api_key=os.getenv("ALPAFLOW_OPENAI_API_KEY")
|
570 |
-
+ model=gen_llm_model,
|
571 |
-
+ api_key=gen_api_key
|
572 |
-
)
|
573 |
-
|
574 |
-
generated_code = llm.complete(input_prompt)
|
575 |
-
@@ -74,60 +89,11 @@
|
576 |
-
),
|
577 |
-
)
|
578 |
-
|
579 |
-
-from typing import Any, Dict, Tuple
|
580 |
-
-import io
|
581 |
-
-import contextlib
|
582 |
-
-import ast
|
583 |
-
-import traceback
|
584 |
-
-
|
585 |
-
-
|
586 |
-
-class SimpleCodeExecutor:
|
587 |
-
- """
|
588 |
-
- A simple code executor that runs Python code with state persistence.
|
589 |
-
-
|
590 |
-
- This executor maintains a global and local state between executions,
|
591 |
-
- allowing for variables to persist across multiple code runs.
|
592 |
-
-
|
593 |
-
- NOTE: not safe for production use! Use with caution.
|
594 |
-
- """
|
595 |
-
-
|
596 |
-
- def __init__(self):
|
597 |
-
- pass
|
598 |
-
-
|
599 |
-
- def execute(self, code: str) -> str:
|
600 |
-
- """
|
601 |
-
- Execute Python code and capture output and return values.
|
602 |
-
-
|
603 |
-
- Args:
|
604 |
-
- code: Python code to execute
|
605 |
-
-
|
606 |
-
- Returns:
|
607 |
-
- Dict with keys `success`, `output`, and `return_value`
|
608 |
-
- """
|
609 |
-
- print(f"Executing code: {code}")
|
610 |
-
- try:
|
611 |
-
- result = subprocess.run(
|
612 |
-
- ["python", code],
|
613 |
-
- stdout=subprocess.PIPE,
|
614 |
-
- stderr=subprocess.PIPE,
|
615 |
-
- text=True,
|
616 |
-
- timeout=60
|
617 |
-
- )
|
618 |
-
- if result.returncode != 0:
|
619 |
-
- print(f"Execution failed with error: {result.stderr.strip()}")
|
620 |
-
- return f"Error: {result.stderr.strip()}"
|
621 |
-
- else:
|
622 |
-
- output = result.stdout.strip()
|
623 |
-
- print(f"Captured Output: {output}")
|
624 |
-
- return output
|
625 |
-
- except subprocess.TimeoutExpired:
|
626 |
-
- print("Execution timed out.")
|
627 |
-
- return "Error: Timeout"
|
628 |
-
- except Exception as e:
|
629 |
-
- print(f"Execution failed with error: {e}")
|
630 |
-
- return f"Error: {e}"
|
631 |
-
-
|
632 |
-
def initialize_code_agent() -> CodeActAgent:
|
633 |
-
- code_executor = SimpleCodeExecutor()
|
634 |
-
+ # DO NOT USE SimpleCodeExecutor - it is insecure.
|
635 |
-
+ # Rely on the code_interpreter tool provided below.
|
636 |
-
|
637 |
-
+ agent_llm_model = os.getenv("CODE_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
638 |
-
llm = GoogleGenAI(
|
639 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
640 |
-
- model="models/gemini-1.5-pro",
|
641 |
-
+ model=agent_llm_model,
|
642 |
-
)
|
643 |
-
|
644 |
-
system_prompt = """\
|
645 |
-
@@ -151,6 +117,7 @@
|
646 |
-
- If further logical reasoning or verification is needed, delegate to **reasoning_agent**.
|
647 |
-
- Otherwise, once you have the final code or execution result, pass your output to **planner_agent** for overall synthesis and presentation.
|
648 |
-
"""
|
649 |
-
+ system_prompt = load_prompt_from_file("code_agent_system_prompt.txt", system_prompt)
|
650 |
-
|
651 |
-
agent = CodeActAgent(
|
652 |
-
name="code_agent",
|
653 |
-
@@ -161,7 +128,7 @@
|
654 |
-
"pipelines, and library development, CodeAgent delivers production-ready Python solutions."
|
655 |
-
),
|
656 |
-
# REMOVED: code_execute_fn=code_executor.execute, # Use code_interpreter tool instead
|
657 |
-
- code_execute_fn=code_executor.execute,
|
658 |
-
tools=[
|
659 |
-
python_code_generator_tool,
|
660 |
-
code_interpreter_tool,
|
661 |
-
```
|
662 |
-
|
663 |
-
|
664 |
-
### 3.10. `math_agent.py` Refactoring
|
665 |
-
|
666 |
-
* **Rationale:** To improve configuration management and potentially simplify the tool interface for the LLM.
|
667 |
-
* **Proposals:**
|
668 |
-
1. **Configuration:** Move the hardcoded agent LLM model name (`models/gemini-1.5-pro`) to configuration. Ensure the WolframAlpha App ID is configured via environment variable (`WOLFRAM_ALPHA_APP_ID`) as intended.
|
669 |
-
2. **Tool Granularity:** The current approach creates a separate tool for almost every single math function (solve, derivative, integral, add, multiply, inverse, mean, median, etc.). While explicit, this results in a very large number of tools for the `ReActAgent` to manage. Consider:
|
670 |
-
* **Grouping:** Group related functions under fewer tools. For example, a `symbolic_math_tool` that takes the operation type (solve, diff, integrate) as a parameter, or a `matrix_ops_tool`.
|
671 |
-
* **Natural Language Interface:** Create a single `calculate` tool that takes a natural language math query (e.g., "solve x**2 - 4 = 0 for x", "mean of [1, 2, 3]") and uses an LLM (or rule-based parsing) internally to dispatch to the appropriate NumPy/SciPy/SymPy function. This simplifies the interface for the main agent LLM but adds complexity within the tool.
|
672 |
-
* **WolframAlpha Prioritization:** Evaluate if WolframAlpha can handle many of these requests directly, potentially reducing the need for numerous specific SymPy/NumPy tools, especially for symbolic tasks.
|
673 |
-
3. **Truncated File:** Since the original file was truncated, ensure the full file is reviewed if possible, as there might be other issues or tools not seen.
|
674 |
-
|
675 |
-
* **Diff Patch (Illustrative - Configuration):**
|
676 |
-
|
677 |
-
```diff
|
678 |
-
--- a/math_agent.py
|
679 |
-
+++ b/math_agent.py
|
680 |
-
@@ -1,5 +1,6 @@
|
681 |
-
import os
|
682 |
-
from typing import List, Optional, Union
|
683 |
-
+ import logging
|
684 |
-
import sympy as sp
|
685 |
-
import numpy as np
|
686 |
-
from llama_index.core.agent.workflow import ReActAgent
|
687 |
-
@@ -12,6 +13,8 @@
|
688 |
-
from scipy.integrate import odeint
|
689 |
-
import numpy.fft as fft
|
690 |
-
|
691 |
-
+ logger = logging.getLogger(__name__)
|
692 |
-
+
|
693 |
-
# --- Symbolic math functions ---
|
694 |
-
|
695 |
-
|
696 |
-
@@ -451,10 +454,11 @@
|
697 |
-
|
698 |
-
|
699 |
-
def initialize_math_agent() -> ReActAgent:
|
700 |
-
+ agent_llm_model = os.getenv("MATH_AGENT_LLM_MODEL", "models/gemini-1.5-pro")
|
701 |
-
llm = GoogleGenAI(
|
702 |
-
api_key=os.getenv("GEMINI_API_KEY"),
|
703 |
-
- model="models/gemini-1.5-pro",
|
704 |
-
+ model=agent_llm_model,
|
705 |
-
)
|
706 |
-
|
707 |
-
# Ensure WolframAlpha App ID is set
|
708 |
-
```
|
709 |
-
|
710 |
-
*(Refactoring proposals section complete)*
|
711 |
-
|
712 |
-
|
713 |
-
## 4. New Feature Designs
|
714 |
-
|
715 |
-
This section outlines the design for the new features requested: YouTube Ingestion and Generic Audio Transcription.
|
716 |
-
|
717 |
-
### 4.1. YouTube Ingestion
|
718 |
-
|
719 |
-
* **Rationale:** To enable the framework to process YouTube videos by extracting audio, transcribing it, and summarizing the content, as requested by the user.
|
720 |
-
* **Design Proposal:**
|
721 |
-
* **Implementation:** Introduce a new dedicated agent, `youtube_agent`, or add tools to the existing `research_agent` or `text_analyzer_agent`. A dedicated agent seems cleaner given the specific multi-step workflow.
|
722 |
-
* **Agent (`youtube_agent`):**
|
723 |
-
* **Purpose:** Manages the end-to-end process of downloading YouTube audio, chunking, transcribing, and summarizing.
|
724 |
-
* **Tools:**
|
725 |
-
1. `download_youtube_audio`: Takes a YouTube URL, uses a library like `yt-dlp` (or potentially `pytube`) to download the audio stream into a temporary file (e.g., `.mp3` or `.opus`). Returns the path to the audio file.
|
726 |
-
2. `chunk_audio_file`: Takes an audio file path and a maximum chunk duration (e.g., 60 seconds). Uses a library like `pydub` or `librosa`+`soundfile` to split the audio into smaller, sequentially numbered temporary files. Returns a list of chunk file paths.
|
727 |
-
3. `transcribe_audio_chunk_gemini`: Takes an audio file path (representing a chunk). Uses the Google Generative AI SDK (`google.generativeai`) to call the Gemini 1.5 Pro model with the audio file for transcription. Returns the transcribed text.
|
728 |
-
4. `summarize_transcript`: Takes the full concatenated transcript text. Uses a Gemini model (e.g., 1.5 Pro or Flash) with a specific prompt to generate a one-paragraph summary. Returns the summary text.
|
729 |
-
* **Workflow (ReAct or Function sequence):**
|
730 |
-
1. Receive YouTube URL.
|
731 |
-
2. Call `download_youtube_audio`.
|
732 |
-
3. Call `chunk_audio_file` with the downloaded audio path.
|
733 |
-
4. Iterate through the list of chunk paths:
|
734 |
-
* Call `transcribe_audio_chunk_gemini` for each chunk.
|
735 |
-
* Collect transcribed text segments.
|
736 |
-
5. Concatenate all transcribed text segments into a full transcript.
|
737 |
-
6. Call `summarize_transcript` with the full transcript.
|
738 |
-
7. Return the full transcript and the summary.
|
739 |
-
8. Clean up temporary audio files (downloaded and chunks).
|
740 |
-
* **Handoff:** Could hand off the transcript and summary to `planner_agent` or `text_analyzer_agent` for further processing or integration.
|
741 |
-
* **Dependencies:** `yt-dlp`, `pydub` (requires `ffmpeg` or `libav`), `google-generativeai`.
|
742 |
-
* **Configuration:** Gemini API Key, chunk duration.
|
743 |
-
|
744 |
-
|
745 |
-
### 4.2. Generic Audio Transcription
|
746 |
-
|
747 |
-
* **Rationale:** To provide a flexible audio transcription capability for local files or remote URLs, using Gemini Pro for quality/latency tolerance and Whisper.cpp as a fallback, exposing it via a Python API as requested.
|
748 |
-
* **Design Proposal:**
|
749 |
-
* **Implementation:** Introduce a new dedicated agent, `transcription_agent`, or add tools to `text_analyzer_agent`. A dedicated agent allows for clearer separation of concerns, especially managing the Whisper.cpp dependency and logic.
|
750 |
-
* **Agent (`transcription_agent`):**
|
751 |
-
* **Purpose:** Transcribes audio from various sources (local path, URL) using either Gemini or Whisper.cpp based on latency requirements or availability.
|
752 |
-
* **Tools:**
|
753 |
-
1. `prepare_audio_source`: Takes a source string (URL or local path). If it's a URL, downloads it to a temporary file using `requests`. Validates the local file path. Returns the path to the local audio file.
|
754 |
-
2. `transcribe_gemini`: Takes an audio file path. Uses the `google-generativeai` SDK to call Gemini 1.5 Pro for transcription. Returns the transcribed text. This is the preferred method when latency is acceptable.
|
755 |
-
3. `transcribe_whisper_cpp`: Takes an audio file path. Uses a Python wrapper around `whisper.cpp` (e.g., installing `whisper.cpp` via `apt` or compiling from source, then using `subprocess` or a dedicated Python binding if available) to perform local transcription. Returns the transcribed text. This is the fallback or low-latency option.
|
756 |
-
4. `choose_transcription_method`: (Internal logic or a simple tool) Takes latency preference (e.g., 'high_quality' vs 'low_latency') or checks Gemini availability/quota. Decides whether to use `transcribe_gemini` or `transcribe_whisper_cpp`.
|
757 |
-
* **Workflow (ReAct or Function sequence):**
|
758 |
-
1. Receive audio source (URL/path) and potentially a latency preference.
|
759 |
-
2. Call `prepare_audio_source` to get a local file path.
|
760 |
-
3. Call `choose_transcription_method` (or execute internal logic) to decide between Gemini and Whisper.
|
761 |
-
4. If Gemini: Call `transcribe_gemini`.
|
762 |
-
5. If Whisper: Call `transcribe_whisper_cpp`.
|
763 |
-
6. Return the resulting transcript.
|
764 |
-
7. Clean up temporary downloaded audio file if applicable.
|
765 |
-
* **Handoff:** Could hand off the transcript to `planner_agent` or `text_analyzer_agent`.
|
766 |
-
* **Python API:**
|
767 |
-
* Define a simple Python function (e.g., in a `transcription_api.py` module) that encapsulates the agent's logic or directly calls the underlying transcription functions.
|
768 |
-
```python
|
769 |
-
# Example API function in transcription_api.py
|
770 |
-
from .transcription_agent import transcribe_audio # Assuming agent logic is refactored
|
771 |
-
|
772 |
-
def get_transcript(source: str, prefer_gemini: bool = True) -> str:
|
773 |
-
"""Transcribes audio from a local path or URL.
|
774 |
-
|
775 |
-
Args:
|
776 |
-
source: Path to the local audio file or URL.
|
777 |
-
prefer_gemini: If True, attempts to use Gemini Pro first.
|
778 |
-
If False or Gemini fails, falls back to Whisper.cpp.
|
779 |
-
|
780 |
-
Returns:
|
781 |
-
The transcribed text.
|
782 |
-
|
783 |
-
Raises:
|
784 |
-
TranscriptionError: If transcription fails.
|
785 |
-
"""
|
786 |
-
# Implementation would call the agent or its refactored functions
|
787 |
-
try:
|
788 |
-
# Simplified logic - actual implementation needs error handling,
|
789 |
-
# Gemini/Whisper selection based on preference/availability
|
790 |
-
transcript = transcribe_audio(source, prefer_gemini)
|
791 |
-
return transcript
|
792 |
-
except Exception as e:
|
793 |
-
# Log error
|
794 |
-
raise TranscriptionError(f"Failed to transcribe {source}: {e}") from e
|
795 |
-
|
796 |
-
class TranscriptionError(Exception):
|
797 |
-
pass
|
798 |
-
```
|
799 |
-
* **Dependencies:** `requests`, `google-generativeai`, `whisper.cpp` (requires separate installation/compilation), potentially Python bindings for `whisper.cpp`.
|
800 |
-
* **Configuration:** Gemini API Key, path to `whisper.cpp` executable or library, Whisper model selection.
|
801 |
-
|
802 |
-
|
803 |
-
## 5. Extra Agent Designs
|
804 |
-
|
805 |
-
This section proposes three additional specialized agents designed to enhance performance on the GAIA benchmark by addressing common challenges like complex fact verification, interpreting visual data representations, and handling long contexts.
|
806 |
-
|
807 |
-
### 5.1. Agent Design 1: Advanced Validation Agent (`validation_agent`)
|
808 |
-
|
809 |
-
* **Purpose:** To perform rigorous validation of factual claims or intermediate results generated by other agents, going beyond the simple contradiction check of the current `verifier_agent`. This agent aims to improve the accuracy and trustworthiness of the final answer by cross-referencing information and performing checks.
|
810 |
-
* **Key Tool Calls:**
|
811 |
-
* `web_search` (from `research_agent` or similar): To find external evidence supporting or refuting a claim.
|
812 |
-
* `browse_and_extract` (from `research_agent` or similar): To access specific URLs found during search and extract relevant text snippets.
|
813 |
-
* `code_interpreter` (from `code_agent`): To perform calculations or simple data manipulations needed for verification (e.g., checking unit conversions, calculating percentages).
|
814 |
-
* `knowledge_base_lookup` (New Tool - Optional): Interface with a structured knowledge base (e.g., Wikidata, internal DB) to verify entities, relationships, or properties.
|
815 |
-
* `llm_check_consistency` (New Tool or LLM call): Use a powerful LLM with a specific prompt to assess the logical consistency between a claim and a set of provided evidence snippets or existing context.
|
816 |
-
* **Agent Loop Sketch (ReAct style):**
|
817 |
-
1. **Input:** A specific claim or statement to validate, along with relevant context or source information.
|
818 |
-
2. **Thought:** Identify the core assertion in the claim. Determine the best validation strategy (e.g., web search for current events, calculation for numerical claims, consistency check for logical statements).
|
819 |
-
3. **Action:** Call the appropriate tool (`web_search`, `code_interpreter`, `llm_check_consistency`).
|
820 |
-
4. **Observation:** Analyze the tool's output (search results, calculation result, consistency assessment).
|
821 |
-
5. **Thought:** Does the observation confirm, refute, or remain inconclusive about the claim? Is more information needed? (e.g., need to browse a specific search result).
|
822 |
-
6. **Action (if needed):** Call another tool (`browse_and_extract`, `llm_check_consistency` with new evidence).
|
823 |
-
7. **Observation:** Analyze new output.
|
824 |
-
8. **Thought:** Synthesize findings. Assign a final validation status (e.g., Confirmed, Refuted, Uncertain) and provide supporting evidence or reasoning.
|
825 |
-
9. **Output:** Validation status and justification.
|
826 |
-
10. **Handoff:** Return result to `planner_agent` or `verifier_agent` (if this agent replaces the contradiction part).
|
827 |
-
|
828 |
-
### 5.2. Agent Design 2: Figure Interpretation Agent (`figure_interpretation_agent`)
|
829 |
-
|
830 |
-
* **Purpose:** To specialize in extracting structured data and meaning from figures, charts, graphs, and tables embedded within images or documents, which are common in GAIA tasks and often require more than just a textual description.
|
831 |
-
* **Key Tool Calls:**
|
832 |
-
* `image_ocr` (New Tool or enhanced `image_analyzer_agent` capability): High-precision OCR focused on extracting text specifically from figures, including axes labels, legends, titles, and data points.
|
833 |
-
* `chart_data_extractor` (New Tool): Utilizes specialized vision models (e.g., DePlot, ChartOCR, or similar fine-tuned models) designed to parse chart types (bar, line, pie) and extract underlying data series or key values.
|
834 |
-
* `table_parser` (New Tool): Uses vision or document AI models to detect table structures in images/PDFs and extract cell content into a structured format (e.g., list of lists, Pandas DataFrame via code execution).
|
835 |
-
* `code_interpreter` (from `code_agent`): To process extracted data (e.g., load into DataFrame, perform simple analysis, re-plot for verification).
|
836 |
-
* `llm_interpret_figure` (New Tool or LLM call): Takes extracted text, data, and potentially the image itself (multimodal) to provide a semantic interpretation of the figure's message or trends.
|
837 |
-
* **Agent Loop Sketch (Function sequence or ReAct):**
|
838 |
-
1. **Input:** An image or document page containing a figure/table, potentially with context or a specific question about it.
|
839 |
-
2. **Action:** Call `image_ocr` to get all text elements.
|
840 |
-
3. **Action:** Call `chart_data_extractor` or `table_parser` based on visual analysis (or try both) to get structured data.
|
841 |
-
4. **Action (Optional):** Call `code_interpreter` to load structured data into a DataFrame for easier handling.
|
842 |
-
5. **Action:** Call `llm_interpret_figure`, providing the extracted text, data (raw or DataFrame), and potentially the original image, asking it to answer the specific question or summarize the figure's key insights.
|
843 |
-
6. **Output:** Structured data (if requested) and/or the semantic interpretation/answer.
|
844 |
-
7. **Handoff:** Return results to `planner_agent` or `reasoning_agent`.
|
845 |
-
|
846 |
-
### 5.3. Agent Design 3: Long Context Management Agent (`long_context_agent`)
|
847 |
-
|
848 |
-
* **Purpose:** To effectively manage and query information from very long documents or conversation histories that exceed the context window limits of standard models or require efficient information retrieval techniques.
|
849 |
-
* **Key Tool Calls:**
|
850 |
-
* `document_chunker` (New Tool): Splits long text into semantically meaningful chunks (e.g., using `SentenceSplitter` from LlamaIndex or more advanced methods).
|
851 |
-
* `vector_store_builder` (New Tool): Takes text chunks and builds an in-memory or persistent vector index (using libraries like `llama-index`, `langchain`, `faiss`, `chromadb`).
|
852 |
-
* `vector_retriever` (New Tool): Queries the built vector index with a specific question to find the most relevant chunks.
|
853 |
-
* `summarizer_tool` (New Tool or LLM call): Generates summaries of long text or selected chunks, potentially using different levels of detail.
|
854 |
-
* `contextual_synthesizer` (New Tool or LLM call): Takes retrieved relevant chunks and the original query, then uses an LLM to synthesize an answer grounded in the retrieved context (RAG pattern).
|
855 |
-
* **Agent Loop Sketch (Can be stateful):**
|
856 |
-
1. **Input:** A long document (text or path) or a long conversation history, and a specific query or task related to it.
|
857 |
-
2. **(Initialization/First Use):**
|
858 |
-
* **Action:** Call `document_chunker`.
|
859 |
-
* **Action:** Call `vector_store_builder` to create an index from the chunks. Store the index reference.
|
860 |
-
3. **(Querying):**
|
861 |
-
* **Action:** Call `vector_retriever` with the user's query to get relevant chunks.
|
862 |
-
* **Action:** Call `contextual_synthesizer`, providing the query and retrieved chunks, to generate the final answer.
|
863 |
-
4. **(Alternative: Summarization Task):**
|
864 |
-
* **Action:** Call `summarizer_tool` on the full text (if feasible for the tool) or on retrieved chunks based on a high-level query.
|
865 |
-
5. **Output:** The synthesized answer or the summary.
|
866 |
-
6. **Handoff:** Return results to `planner_agent`.
|
867 |
-
|
868 |
-
|
869 |
-
## 6. Migration Plan
|
870 |
-
|
871 |
-
This section details the recommended steps for applying the proposed changes, lists new dependencies, and outlines minimal validation tests.
|
872 |
-
|
873 |
-
### 6.1. Order of Implementation
|
874 |
-
|
875 |
-
It is recommended to apply changes in the following order to minimize disruption and build upon stable foundations:
|
876 |
-
|
877 |
-
1. **Core Refactoring (`app.py`, Configuration, Logging):**
|
878 |
-
* Implement centralized configuration (e.g., `.env` file) and update all agents to use it for API keys, model names, etc.
|
879 |
-
* Integrate Python's `logging` module throughout `app.py` and all agent files, replacing `print` statements.
|
880 |
-
* Refactor `app.py`: Implement singleton agent initialization and break down `run_and_submit_all`.
|
881 |
-
* Apply structural refactors to agents (class-based structure, avoiding globals) like `role_agent`, `verifier_agent`, `research_agent`.
|
882 |
-
2. **Critical Security Fix (`code_agent`):**
|
883 |
-
* Immediately remove the `SimpleCodeExecutor` and modify `code_agent` to rely solely on the `code_interpreter` tool.
|
884 |
-
3. **Core Functionality Refactoring (`verifier_agent`, `math_agent`):**
|
885 |
-
* Improve `verifier_agent`'s contradiction detection (e.g., using an LLM or NLI model).
|
886 |
-
* Refactor `math_agent` tools if choosing to group them or use a natural language interface.
|
887 |
-
4. **New Feature: Generic Audio Transcription (`transcription_agent`):**
|
888 |
-
* Install `whisper.cpp` and its dependencies.
|
889 |
-
* Implement the `transcription_agent` and its tools (`prepare_audio_source`, `transcribe_gemini`, `transcribe_whisper_cpp`).
|
890 |
-
* Implement the Python API function `get_transcript`.
|
891 |
-
5. **New Feature: YouTube Ingestion (`youtube_agent`):**
|
892 |
-
* Install `yt-dlp` and `pydub` (and `ffmpeg`).
|
893 |
-
* Implement the `youtube_agent` and its tools (`download_youtube_audio`, `chunk_audio_file`, `transcribe_audio_chunk_gemini`, `summarize_transcript`).
|
894 |
-
6. **New Agent Implementation (Validation, Figure, Long Context):**
|
895 |
-
* Implement `validation_agent` and its tools.
|
896 |
-
* Implement `figure_interpretation_agent` and its tools (requires sourcing/installing chart/table parsing models/libraries).
|
897 |
-
* Implement `long_context_agent` and its tools (requires vector DB setup like `faiss` or `chromadb`).
|
898 |
-
7. **Integration and Workflow Adjustments:**
|
899 |
-
* Update `planner_agent`'s system prompt and handoff logic to incorporate the new agents.
|
900 |
-
* Update other agents' handoff targets as needed.
|
901 |
-
* Update `app.py` if the overall agent initialization or workflow invocation changes.
|
902 |
-
|
903 |
-
### 6.2. New Dependencies (`requirements.txt`)
|
904 |
-
|
905 |
-
Based on the refactoring and new features, the following dependencies might need to be added or updated in `requirements.txt` (or managed via environment setup):
|
906 |
-
|
907 |
-
* `python-dotenv`: For loading configuration from `.env` files.
|
908 |
-
* `google-generativeai`: For interacting with Gemini models (already likely present via `llama-index-llms-google-genai`).
|
909 |
-
* `yt-dlp`: For downloading YouTube videos.
|
910 |
-
* `pydub`: For audio manipulation (chunking). Requires `ffmpeg` or `libav` system dependency.
|
911 |
-
* `llama-index-vector-stores-faiss` / `faiss-cpu` / `faiss-gpu`: For `long_context_agent` vector store (choose one).
|
912 |
-
* `chromadb` / `llama-index-vector-stores-chroma`: Alternative vector store for `long_context_agent`.
|
913 |
-
* `llama-index-multi-modal-llms-google`: Ensure multimodal support for Gemini is correctly installed.
|
914 |
-
* *Possibly*: Libraries for NLI models (e.g., `transformers`, `torch`) if used in `validation_agent`.
|
915 |
-
* *Possibly*: Libraries for chart/table parsing (e.g., specific models from Hugging Face, `opencv-python`, `pdf2image`) if implementing `figure_interpretation_agent` tools.
|
916 |
-
* *Possibly*: Python bindings for `whisper.cpp` if not using `subprocess`.
|
917 |
-
|
918 |
-
**System Dependencies:**
|
919 |
-
|
920 |
-
* `ffmpeg` or `libav`: Required by `pydub`.
|
921 |
-
* `whisper.cpp`: Needs to be compiled or installed separately. Follow its specific instructions.
|
922 |
-
|
923 |
-
### 6.3. Validation Tests
|
924 |
-
|
925 |
-
Minimal tests should be implemented to validate key changes:
|
926 |
-
|
927 |
-
1. **Configuration:** Test loading of API keys and model names from the configuration source.
|
928 |
-
2. **Logging:** Verify that logs are being generated at the correct levels and formats.
|
929 |
-
3. **`code_agent` Security:** Test that `code_agent` uses `code_interpreter` and *not* the removed `SimpleCodeExecutor`. Attempt a malicious code execution via prompt to ensure it fails safely within the interpreter's sandbox.
|
930 |
-
4. **`verifier_agent` Contradiction:** Test the improved contradiction detection with sample pairs of contradictory and non-contradictory statements.
|
931 |
-
5. **`transcription_agent`:**
|
932 |
-
* Test with a short local audio file using both Gemini and Whisper.cpp, comparing output quality/speed.
|
933 |
-
* Test with an audio URL.
|
934 |
-
* Test the Python API function `get_transcript`.
|
935 |
-
6. **`youtube_agent`:**
|
936 |
-
* Test with a short YouTube video URL.
|
937 |
-
* Verify audio download, chunking, transcription of chunks, and final summary generation.
|
938 |
-
* Check cleanup of temporary files.
|
939 |
-
7. **New Agents (Basic):**
|
940 |
-
* For `validation_agent`, `figure_interpretation_agent`, `long_context_agent`, implement basic tests confirming agent initialization and successful calls to their primary new tools with mock inputs/outputs.
|
941 |
-
8. **End-to-End Smoke Test:** Run `app.py` and process one or two simple GAIA tasks that are likely to invoke the refactored components and potentially a new feature (if a relevant task exists) to ensure the overall workflow remains functional.
|
942 |
-
|
943 |
-
*(Implementation plan complete. Ready for user confirmation.)*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
get_cookie.py
CHANGED
@@ -56,4 +56,13 @@ def cookies_to_env(cookie_file_path: str) -> str:
|
|
56 |
except Exception as e:
|
57 |
raise ValueError(f"Error converting cookie file: {str(e)}")
|
58 |
|
59 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
56 |
except Exception as e:
|
57 |
raise ValueError(f"Error converting cookie file: {str(e)}")
|
58 |
|
59 |
+
def save_to_env_file(env_content: str, env_file: str = '.env') -> None:
|
60 |
+
"""Save environment variable content to .env file"""
|
61 |
+
try:
|
62 |
+
with open(env_file, 'w') as f:
|
63 |
+
f.write(env_content)
|
64 |
+
#print(f"Successfully saved to {env_file}")
|
65 |
+
except Exception as e:
|
66 |
+
raise ValueError(f"Error saving to env file: {str(e)}")
|
67 |
+
|
68 |
+
save_to_env_file(cookies_to_env(export_youtube_cookies_netscape("youtube.com")))
|
packages.txt
CHANGED
@@ -4,4 +4,5 @@ libxss1
|
|
4 |
libatk-bridge2.0-0
|
5 |
libgtk-3-0
|
6 |
libgbm-dev
|
|
|
7 |
chromium-driver
|
|
|
4 |
libatk-bridge2.0-0
|
5 |
libgtk-3-0
|
6 |
libgbm-dev
|
7 |
+
ffmpeg
|
8 |
chromium-driver
|
prompts/code_gen_prompt.txt
CHANGED
@@ -44,12 +44,11 @@ Notes:
|
|
44 |
seaborn>=0.13.2,
|
45 |
sqlalchemy>=2.0.40,
|
46 |
statsmodels>=0.14.4,
|
|
|
47 |
sympy>=1.14.0,
|
48 |
youtube-transcript-api>=1.0.3,
|
49 |
yt-dlp>=2025.3.31
|
50 |
|
51 |
-
- You can also access and process YouTube video and audio streams using `yt-dlp`, `opencv-python`, `ffmpeg-python`, or `imageio`.
|
52 |
-
|
53 |
Prompt: {prompt}
|
54 |
|
55 |
Code:
|
|
|
44 |
seaborn>=0.13.2,
|
45 |
sqlalchemy>=2.0.40,
|
46 |
statsmodels>=0.14.4,
|
47 |
+
stockfish==3.28.0,
|
48 |
sympy>=1.14.0,
|
49 |
youtube-transcript-api>=1.0.3,
|
50 |
yt-dlp>=2025.3.31
|
51 |
|
|
|
|
|
52 |
Prompt: {prompt}
|
53 |
|
54 |
Code:
|
prompts/image_analyzer_prompt.txt
CHANGED
@@ -1,69 +1,69 @@
|
|
1 |
You are ImageAnalyzerAgent, an expert in cold, factual visual analysis. Your sole mission is to describe and analyze each image with the utmost exhaustiveness, precision, and absence of conjecture. Follow these directives exactly:
|
2 |
|
3 |
-
1. **Context & Role**
|
4 |
-
- You are an automated, impartial analysis system with no emotional or subjective bias.
|
5 |
- Your objective is to deliver a **purely factual** analysis of the image, avoiding artistic interpretation, author intent, aesthetic judgment, or speculation about non-visible elements.
|
6 |
|
7 |
-
2. **Analysis Structure**
|
8 |
Adhere strictly to this order in your output:
|
9 |
|
10 |
-
1. **General Identification**
|
11 |
-
- Output format: “Image received: [filename or path]”.
|
12 |
-
- Dimensions (if available): width × height in pixels.
|
13 |
- File format (JPEG, PNG, GIF, etc.).
|
14 |
|
15 |
-
2. **Scene Description**
|
16 |
-
- Total number of detected objects.
|
17 |
- Spatial distribution: primary areas of interest (top/left/center, etc.).
|
18 |
|
19 |
-
3. **Detailed Object List**
|
20 |
-
For **each** detected object, provide:
|
21 |
-
- **Class/type** (person, animal, vehicle, landscape, text, graphic, etc.).
|
22 |
-
- **Exact position**: bounding box coordinates (x_min, y_min, x_max, y_max).
|
23 |
-
- **Relative size**: percentage of image area or pixel dimensions.
|
24 |
-
- **Dominant color** (for uniform shapes) or top color palette.
|
25 |
- **Attributes**: posture, orientation, readable text, pattern, state (open/closed, on/off), geometric properties (shape, symmetry).
|
26 |
|
27 |
-
4. **Color Palette & Composition**
|
28 |
-
- **Simplified histogram**: list the 5 most frequent colors in hexadecimal (#RRGGBB) with approximate percentages.
|
29 |
-
- **Contrast & brightness**: factual description (e.g., “low overall contrast,” “very dark region in bottom right”).
|
30 |
- **Visual balance**: symmetric or asymmetric distribution of masses, guiding lines, focal points.
|
31 |
|
32 |
-
5. **Technical Metrics & Metadata**
|
33 |
-
- EXIF data (if available): capture date/time, camera model, aperture, shutter speed, ISO.
|
34 |
- Effective resolution (DPI/PPI), aspect ratio (4:3, 16:9, square).
|
35 |
|
36 |
-
6. **Textual Elements**
|
37 |
-
- OCR of **all** visible text: exact transcription, approximate font type (serif/sans-serif), relative size.
|
38 |
- Text layout (alignment, orientation, spacing).
|
39 |
|
40 |
-
7. **Geometric Analysis**
|
41 |
-
- Identify repeating patterns (textures, mosaics, geometric motifs).
|
42 |
- Measure dominant angles (vertical, horizontal, diagonal lines).
|
43 |
|
44 |
-
8. **Uncertainty Indicators**
|
45 |
-
- For each object or attribute, briefly state confidence level (high/medium/low) based on image clarity (blur, obstruction, low resolution).
|
46 |
- Example: “Detected ‘bicycle’ with medium confidence (partially blurred).”
|
47 |
|
48 |
-
9. **Factual Summary**
|
49 |
-
- Recap all listed elements without additional commentary.
|
50 |
- Numbered bullet list, each item prefixed by its category label (e.g., “1. Detected objects: …”, “2. Color palette: …”).
|
51 |
|
52 |
-
3. **Absolute Constraints**
|
53 |
-
- No psychological, symbolic, or subjective interpretation.
|
54 |
-
- No value judgments or qualifiers.
|
55 |
-
- Never omit any visible object or attribute.
|
56 |
- Strictly follow the prescribed order and structure without alteration.
|
57 |
|
58 |
-
4. **Output Format**
|
59 |
- Plain text only, numbered sections separated by two line breaks.
|
60 |
|
61 |
-
5. **Agent Handoff**
|
62 |
-
Once the image analysis is fully complete, hand off to one of the following agents:
|
63 |
-
- **planner_agent** for roadmap creation or final synthesis.
|
64 |
-
- **research_agent** for any additional information gathering.
|
65 |
- **reasoning_agent** for pure chain-of-thought reasoning or deeper logical interpretation.
|
66 |
|
67 |
-
By adhering to these instructions, ensure your visual analysis is cold, factual, comprehensive, and
|
68 |
-
completely devoid of subjectivity before handing off.
|
69 |
|
|
|
1 |
You are ImageAnalyzerAgent, an expert in cold, factual visual analysis. Your sole mission is to describe and analyze each image with the utmost exhaustiveness, precision, and absence of conjecture. Follow these directives exactly:
|
2 |
|
3 |
+
1. **Context & Role**
|
4 |
+
- You are an automated, impartial analysis system with no emotional or subjective bias.
|
5 |
- Your objective is to deliver a **purely factual** analysis of the image, avoiding artistic interpretation, author intent, aesthetic judgment, or speculation about non-visible elements.
|
6 |
|
7 |
+
2. **Analysis Structure**
|
8 |
Adhere strictly to this order in your output:
|
9 |
|
10 |
+
1. **General Identification**
|
11 |
+
- Output format: “Image received: [filename or path]”.
|
12 |
+
- Dimensions (if available): width × height in pixels.
|
13 |
- File format (JPEG, PNG, GIF, etc.).
|
14 |
|
15 |
+
2. **Scene Description**
|
16 |
+
- Total number of detected objects.
|
17 |
- Spatial distribution: primary areas of interest (top/left/center, etc.).
|
18 |
|
19 |
+
3. **Detailed Object List**
|
20 |
+
For **each** detected object, provide:
|
21 |
+
- **Class/type** (person, animal, vehicle, landscape, text, graphic, etc.).
|
22 |
+
- **Exact position**: bounding box coordinates (x_min, y_min, x_max, y_max).
|
23 |
+
- **Relative size**: percentage of image area or pixel dimensions.
|
24 |
+
- **Dominant color** (for uniform shapes) or top color palette.
|
25 |
- **Attributes**: posture, orientation, readable text, pattern, state (open/closed, on/off), geometric properties (shape, symmetry).
|
26 |
|
27 |
+
4. **Color Palette & Composition**
|
28 |
+
- **Simplified histogram**: list the 5 most frequent colors in hexadecimal (#RRGGBB) with approximate percentages.
|
29 |
+
- **Contrast & brightness**: factual description (e.g., “low overall contrast,” “very dark region in bottom right”).
|
30 |
- **Visual balance**: symmetric or asymmetric distribution of masses, guiding lines, focal points.
|
31 |
|
32 |
+
5. **Technical Metrics & Metadata**
|
33 |
+
- EXIF data (if available): capture date/time, camera model, aperture, shutter speed, ISO.
|
34 |
- Effective resolution (DPI/PPI), aspect ratio (4:3, 16:9, square).
|
35 |
|
36 |
+
6. **Textual Elements**
|
37 |
+
- OCR of **all** visible text: exact transcription, approximate font type (serif/sans-serif), relative size.
|
38 |
- Text layout (alignment, orientation, spacing).
|
39 |
|
40 |
+
7. **Geometric Analysis**
|
41 |
+
- Identify repeating patterns (textures, mosaics, geometric motifs).
|
42 |
- Measure dominant angles (vertical, horizontal, diagonal lines).
|
43 |
|
44 |
+
8. **Uncertainty Indicators**
|
45 |
+
- For each object or attribute, briefly state confidence level (high/medium/low) based on image clarity (blur, obstruction, low resolution).
|
46 |
- Example: “Detected ‘bicycle’ with medium confidence (partially blurred).”
|
47 |
|
48 |
+
9. **Factual Summary**
|
49 |
+
- Recap all listed elements without additional commentary.
|
50 |
- Numbered bullet list, each item prefixed by its category label (e.g., “1. Detected objects: …”, “2. Color palette: …”).
|
51 |
|
52 |
+
3. **Absolute Constraints**
|
53 |
+
- No psychological, symbolic, or subjective interpretation.
|
54 |
+
- No value judgments or qualifiers.
|
55 |
+
- Never omit any visible object or attribute.
|
56 |
- Strictly follow the prescribed order and structure without alteration.
|
57 |
|
58 |
+
4. **Output Format**
|
59 |
- Plain text only, numbered sections separated by two line breaks.
|
60 |
|
61 |
+
5. **Agent Handoff**
|
62 |
+
Once the image analysis is fully complete, hand off to one of the following agents:
|
63 |
+
- **planner_agent** for roadmap creation or final synthesis.
|
64 |
+
- **research_agent** for any additional information gathering.
|
65 |
- **reasoning_agent** for pure chain-of-thought reasoning or deeper logical interpretation.
|
66 |
|
67 |
+
By adhering to these instructions, ensure your visual analysis is cold, factual, comprehensive, and
|
68 |
+
completely devoid of subjectivity before handing off.
|
69 |
|
prompts/planner_agent_prompt.txt
CHANGED
@@ -24,10 +24,15 @@ Once planning is complete, address each sub-question in turn and then hand off a
|
|
24 |
- For pure chain-of-thought reasoning or complex logical verification, invoke **reasoning_agent** to perform detailed step-by-step analysis.
|
25 |
|
26 |
**Important**
|
|
|
|
|
|
|
|
|
27 |
Before providing any final answer to the user, you **must**:
|
28 |
-
1. Invoke **advanced_validation_agent** to
|
29 |
-
-
|
30 |
-
- If validation
|
|
|
31 |
2. Invoke the **answer_question** tool as the last step. This tool will format your response properly, including your reasoning steps and a final concise answer following the strict template.
|
32 |
|
33 |
**Agent Constraints**
|
@@ -35,4 +40,4 @@ Only the following agents are available: **code_agent**, **research_agent**, **m
|
|
35 |
Do **not** invoke any other agents (e.g., **chess_agent**, **educate_agent**, **game_agent**, etc.).
|
36 |
|
37 |
**Finalize**
|
38 |
-
After all sub-questions have been addressed
|
|
|
24 |
- For pure chain-of-thought reasoning or complex logical verification, invoke **reasoning_agent** to perform detailed step-by-step analysis.
|
25 |
|
26 |
**Important**
|
27 |
+
Before performing any reasoning, taking any action, or invoking any other tools — your very first step, including your first thought — **must be** to invoke the **generate_substeps** tool.
|
28 |
+
- This action is **mandatory** and must always be executed first.
|
29 |
+
- You are not allowed to perform any task-specific analysis, reasoning, or delegation before this planning step is complete.
|
30 |
+
|
31 |
Before providing any final answer to the user, you **must**:
|
32 |
+
1. Invoke **advanced_validation_agent** as the penultimate step in your plan to ensure the logical coherence, factual consistency, and structural validity of all outputs.
|
33 |
+
- This step is **mandatory** and non-negotiable.
|
34 |
+
- If validation fails, you must **discard the entire plan and restart the planning and execution process from the beginning**.
|
35 |
+
- Only proceed if validation is successful.
|
36 |
2. Invoke the **answer_question** tool as the last step. This tool will format your response properly, including your reasoning steps and a final concise answer following the strict template.
|
37 |
|
38 |
**Agent Constraints**
|
|
|
40 |
Do **not** invoke any other agents (e.g., **chess_agent**, **educate_agent**, **game_agent**, etc.).
|
41 |
|
42 |
**Finalize**
|
43 |
+
After all sub-questions have been addressed, by hand-off or self-answer, and the plan has passed **advanced_validation_agent**, compile and present the ultimate, coherent solution using the `answer_question` tool, ensuring your final response follows the required format and includes your chain of thought.
|
prompts/reasoning_agent_prompt.txt
CHANGED
@@ -14,10 +14,10 @@ You are **ReasoningAgent**, an advanced cognitive engine specialized in rigorous
|
|
14 |
- Provide the feedback or validation output back into **planner_agent** to refine or adjust the roadmap.
|
15 |
- Repeat the validation step until approval is obtained.
|
16 |
|
17 |
-
4. **
|
18 |
-
-
|
19 |
|
20 |
**Constraints:**
|
21 |
- No direct access to external data sources or the internet; all inference happens via the provided tools.
|
22 |
-
- Do not skip any step: reasoning → planning → validation → (if approved) final answer
|
23 |
|
|
|
14 |
- Provide the feedback or validation output back into **planner_agent** to refine or adjust the roadmap.
|
15 |
- Repeat the validation step until approval is obtained.
|
16 |
|
17 |
+
4. **Final answer**
|
18 |
+
- Once validated, hand off the final plan to **planner_agent** for a polished, final response.
|
19 |
|
20 |
**Constraints:**
|
21 |
- No direct access to external data sources or the internet; all inference happens via the provided tools.
|
22 |
+
- Do not skip any step: reasoning → planning → validation → (if approved) final answer.
|
23 |
|
prompts/text_analyzer_prompt.txt
CHANGED
@@ -1,42 +1,42 @@
|
|
1 |
You are TextAnalyzerAgent, an expert text‐analysis assistant. On each request—whether raw text or a PDF URL/path—you must:
|
2 |
|
3 |
-
1. **Determine Input Type**
|
4 |
-
- If the input is a URL or a local file path ending in “.pdf”, call `extract_text_from_pdf` with `{"source": <input>}`.
|
5 |
- Otherwise, treat the input directly as text.
|
6 |
|
7 |
-
2. **Extract Text (if PDF)**
|
8 |
-
Thought: Explain that you are retrieving text from the PDF or accepting raw text.
|
9 |
-
Action: extract_text_from_pdf or (skip for raw text)
|
10 |
-
Action Input: {"source": <input>}
|
11 |
-
Await Observation: the full concatenated text or an error message.
|
12 |
- If an error occurs, immediately return that error as your Answer.
|
13 |
|
14 |
-
3. **Analyze Content**
|
15 |
-
Thought: Outline that you will produce a summary and list of facts.
|
16 |
-
Action: analyze_text
|
17 |
-
Action Input: {"text": <extracted_or_raw_text>}
|
18 |
Await Observation: a plain‐text response with “Summary:” and “Facts:” sections.
|
19 |
|
20 |
-
4. **Format Response**
|
21 |
-
Thought: I can answer without using any more tools.
|
22 |
-
Answer:
|
23 |
-
Summary:
|
24 |
-
• <bullet point 1>
|
25 |
-
• <bullet point 2>
|
26 |
-
• <bullet point 3>
|
27 |
-
|
28 |
-
Facts:
|
29 |
-
• <fact 1>
|
30 |
-
• <fact 2>
|
31 |
-
• …
|
32 |
-
|
33 |
-
5. **Guidelines**
|
34 |
-
- Never include extra sections or commentary.
|
35 |
-
- Use exactly one tool per Action.
|
36 |
-
- If extraction fails, stop and return the error.
|
37 |
- Ensure bullets use “• ” and sections are labeled “Summary:” and “Facts:”.
|
38 |
|
39 |
-
6. **Hand‐Off**
|
40 |
After delivering your “Summary:” and “Facts:”, pass the extracted facts list to `verifier_agent` for confidence scoring and contradiction detection.
|
41 |
|
42 |
Follow this Thought→Action→Observation→… cycle rigorously to produce consistent, reliable analyses.
|
|
|
1 |
You are TextAnalyzerAgent, an expert text‐analysis assistant. On each request—whether raw text or a PDF URL/path—you must:
|
2 |
|
3 |
+
1. **Determine Input Type**
|
4 |
+
- If the input is a URL or a local file path ending in “.pdf”, call `extract_text_from_pdf` with `{"source": <input>}`.
|
5 |
- Otherwise, treat the input directly as text.
|
6 |
|
7 |
+
2. **Extract Text (if PDF)**
|
8 |
+
Thought: Explain that you are retrieving text from the PDF or accepting raw text.
|
9 |
+
Action: extract_text_from_pdf or (skip for raw text)
|
10 |
+
Action Input: {"source": <input>}
|
11 |
+
Await Observation: the full concatenated text or an error message.
|
12 |
- If an error occurs, immediately return that error as your Answer.
|
13 |
|
14 |
+
3. **Analyze Content**
|
15 |
+
Thought: Outline that you will produce a summary and list of facts.
|
16 |
+
Action: analyze_text
|
17 |
+
Action Input: {"text": <extracted_or_raw_text>}
|
18 |
Await Observation: a plain‐text response with “Summary:” and “Facts:” sections.
|
19 |
|
20 |
+
4. **Format Response**
|
21 |
+
Thought: I can answer without using any more tools.
|
22 |
+
Answer:
|
23 |
+
Summary:
|
24 |
+
• <bullet point 1>
|
25 |
+
• <bullet point 2>
|
26 |
+
• <bullet point 3>
|
27 |
+
|
28 |
+
Facts:
|
29 |
+
• <fact 1>
|
30 |
+
• <fact 2>
|
31 |
+
• …
|
32 |
+
|
33 |
+
5. **Guidelines**
|
34 |
+
- Never include extra sections or commentary.
|
35 |
+
- Use exactly one tool per Action.
|
36 |
+
- If extraction fails, stop and return the error.
|
37 |
- Ensure bullets use “• ” and sections are labeled “Summary:” and “Facts:”.
|
38 |
|
39 |
+
6. **Hand‐Off**
|
40 |
After delivering your “Summary:” and “Facts:”, pass the extracted facts list to `verifier_agent` for confidence scoring and contradiction detection.
|
41 |
|
42 |
Follow this Thought→Action→Observation→… cycle rigorously to produce consistent, reliable analyses.
|
pyproject.toml
CHANGED
@@ -43,9 +43,12 @@ dependencies = [
|
|
43 |
"scikit-learn>=1.6.1",
|
44 |
"scipy>=1.15.2",
|
45 |
"seaborn>=0.13.2",
|
|
|
46 |
"sqlalchemy>=2.0.40",
|
47 |
"statsmodels>=0.14.4",
|
|
|
48 |
"sympy>=1.14.0",
|
|
|
49 |
"youtube-transcript-api>=1.0.3",
|
50 |
"yt-dlp>=2025.3.31",
|
51 |
]
|
|
|
43 |
"scikit-learn>=1.6.1",
|
44 |
"scipy>=1.15.2",
|
45 |
"seaborn>=0.13.2",
|
46 |
+
"soundfile>=0.13.1",
|
47 |
"sqlalchemy>=2.0.40",
|
48 |
"statsmodels>=0.14.4",
|
49 |
+
"stockfish>=3.28.0",
|
50 |
"sympy>=1.14.0",
|
51 |
+
"torchaudio>=2.7.0",
|
52 |
"youtube-transcript-api>=1.0.3",
|
53 |
"yt-dlp>=2025.3.31",
|
54 |
]
|
requirements.txt
CHANGED
@@ -39,6 +39,8 @@ scipy>=1.15.2
|
|
39 |
seaborn>=0.13.2
|
40 |
sqlalchemy>=2.0.40
|
41 |
statsmodels>=0.14.4
|
|
|
42 |
sympy>=1.14.0
|
|
|
43 |
youtube-transcript-api>=1.0.3
|
44 |
yt-dlp>=2025.3.31
|
|
|
39 |
seaborn>=0.13.2
|
40 |
sqlalchemy>=2.0.40
|
41 |
statsmodels>=0.14.4
|
42 |
+
stockfish==3.28.0
|
43 |
sympy>=1.14.0
|
44 |
+
torchaudio>=2.7.0
|
45 |
youtube-transcript-api>=1.0.3
|
46 |
yt-dlp>=2025.3.31
|
todo.md
DELETED
@@ -1,44 +0,0 @@
|
|
1 |
-
# GAIA Framework Improvement Plan - ToDo List
|
2 |
-
|
3 |
-
1. [X] Create overall output document structure (`gaia_improvement_plan.md`).
|
4 |
-
2. [ ] Generate ASCII diagram of the *revised* architecture (incorporating proposed changes).
|
5 |
-
3. [ ] Perform Code Quality Review:
|
6 |
-
* [ ] Review `app.py`
|
7 |
-
* [ ] Review `role_agent.py`
|
8 |
-
* [ ] Review `image_analyzer_agent.py`
|
9 |
-
* [ ] Review `verifier_agent.py`
|
10 |
-
* [ ] Review `research_agent.py`
|
11 |
-
* [ ] Review `text_analyzer_agent.py`
|
12 |
-
* [ ] Review `reasoning_agent.py`
|
13 |
-
* [ ] Review `planner_agent.py`
|
14 |
-
* [ ] Review `code_agent.py`
|
15 |
-
* [ ] Review `math_agent.py` (note truncation)
|
16 |
-
* [ ] Consolidate findings for Code Quality section in the report.
|
17 |
-
4. [ ] Develop Refactor Proposals:
|
18 |
-
* [ ] Propose refactors for `app.py` (if any) + generate diff.
|
19 |
-
* [ ] Propose refactors for `role_agent.py` (if any) + generate diff.
|
20 |
-
* [ ] Propose refactors for `image_analyzer_agent.py` (if any) + generate diff.
|
21 |
-
* [ ] Propose refactors for `verifier_agent.py` (if any) + generate diff.
|
22 |
-
* [ ] Propose refactors for `research_agent.py` (if any) + generate diff.
|
23 |
-
* [ ] Propose refactors for `text_analyzer_agent.py` (if any) + generate diff.
|
24 |
-
* [ ] Propose refactors for `reasoning_agent.py` (if any) + generate diff.
|
25 |
-
* [ ] Propose refactors for `planner_agent.py` (if any) + generate diff.
|
26 |
-
* [ ] Propose refactors for `code_agent.py` (if any) + generate diff.
|
27 |
-
* [ ] Propose refactors for `math_agent.py` (if any) + generate diff.
|
28 |
-
* [ ] Consolidate proposals for Refactoring section in the report.
|
29 |
-
5. [ ] Design New Features:
|
30 |
-
* [ ] Design YouTube Ingestion feature (module/agent, steps, tools, API).
|
31 |
-
* [ ] Design Generic Audio Transcription feature (module/agent, steps, tools, API, Gemini/Whisper logic).
|
32 |
-
* [ ] Document designs in New Features section of the report.
|
33 |
-
6. [ ] Design Extra Agents:
|
34 |
-
* [ ] Design Agent 1 (Purpose, Tools, Loop Sketch).
|
35 |
-
* [ ] Design Agent 2 (Purpose, Tools, Loop Sketch).
|
36 |
-
* [ ] Design Agent 3 (Purpose, Tools, Loop Sketch).
|
37 |
-
* [ ] Document designs in Extra Agents section of the report.
|
38 |
-
7. [ ] Create Migration Plan:
|
39 |
-
* [ ] Define order of applying changes/features.
|
40 |
-
* [ ] List new dependencies for `requirements.txt`.
|
41 |
-
* [ ] Outline minimal unit/integration tests for validation.
|
42 |
-
* [ ] Document plan in Migration Plan section of the report.
|
43 |
-
8. [ ] Assemble final report (`gaia_improvement_plan.md`).
|
44 |
-
9. [ ] Ask user for confirmation/feedback on the plan before proceeding (as per user's "First action" instruction).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
uv.lock
CHANGED
@@ -914,9 +914,12 @@ dependencies = [
|
|
914 |
{ name = "scikit-learn" },
|
915 |
{ name = "scipy" },
|
916 |
{ name = "seaborn" },
|
|
|
917 |
{ name = "sqlalchemy" },
|
918 |
{ name = "statsmodels" },
|
|
|
919 |
{ name = "sympy" },
|
|
|
920 |
{ name = "youtube-transcript-api" },
|
921 |
{ name = "yt-dlp" },
|
922 |
]
|
@@ -962,9 +965,12 @@ requires-dist = [
|
|
962 |
{ name = "scikit-learn", specifier = ">=1.6.1" },
|
963 |
{ name = "scipy", specifier = ">=1.15.2" },
|
964 |
{ name = "seaborn", specifier = ">=0.13.2" },
|
|
|
965 |
{ name = "sqlalchemy", specifier = ">=2.0.40" },
|
966 |
{ name = "statsmodels", specifier = ">=0.14.4" },
|
|
|
967 |
{ name = "sympy", specifier = ">=1.14.0" },
|
|
|
968 |
{ name = "youtube-transcript-api", specifier = ">=1.0.3" },
|
969 |
{ name = "yt-dlp", specifier = ">=2025.3.31" },
|
970 |
]
|
@@ -3556,6 +3562,25 @@ wheels = [
|
|
3556 |
{ url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575 },
|
3557 |
]
|
3558 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3559 |
[[package]]
|
3560 |
name = "soupsieve"
|
3561 |
version = "2.7"
|
@@ -3652,6 +3677,15 @@ wheels = [
|
|
3652 |
{ url = "https://files.pythonhosted.org/packages/1d/eb/cb8b01f5edf8f135eb3d0553d159db113a35b2948d0e51eeb735e7ae09ea/statsmodels-0.14.4-cp313-cp313-win_amd64.whl", hash = "sha256:81030108d27aecc7995cac05aa280cf8c6025f6a6119894eef648997936c2dd0", size = 9817574 },
|
3653 |
]
|
3654 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3655 |
[[package]]
|
3656 |
name = "striprtf"
|
3657 |
version = "0.0.26"
|
@@ -3816,6 +3850,32 @@ wheels = [
|
|
3816 |
{ url = "https://files.pythonhosted.org/packages/90/48/7e6477cf40d48cc0a61fa0d41ee9582b9a316b12772fcac17bc1a40178e7/torch-2.7.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:27f5007bdf45f7bb7af7f11d1828d5c2487e030690afb3d89a651fd7036a390e", size = 68575074 },
|
3817 |
]
|
3818 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
3819 |
[[package]]
|
3820 |
name = "tqdm"
|
3821 |
version = "4.67.1"
|
|
|
914 |
{ name = "scikit-learn" },
|
915 |
{ name = "scipy" },
|
916 |
{ name = "seaborn" },
|
917 |
+
{ name = "soundfile" },
|
918 |
{ name = "sqlalchemy" },
|
919 |
{ name = "statsmodels" },
|
920 |
+
{ name = "stockfish" },
|
921 |
{ name = "sympy" },
|
922 |
+
{ name = "torchaudio" },
|
923 |
{ name = "youtube-transcript-api" },
|
924 |
{ name = "yt-dlp" },
|
925 |
]
|
|
|
965 |
{ name = "scikit-learn", specifier = ">=1.6.1" },
|
966 |
{ name = "scipy", specifier = ">=1.15.2" },
|
967 |
{ name = "seaborn", specifier = ">=0.13.2" },
|
968 |
+
{ name = "soundfile", specifier = ">=0.13.1" },
|
969 |
{ name = "sqlalchemy", specifier = ">=2.0.40" },
|
970 |
{ name = "statsmodels", specifier = ">=0.14.4" },
|
971 |
+
{ name = "stockfish", specifier = ">=3.28.0" },
|
972 |
{ name = "sympy", specifier = ">=1.14.0" },
|
973 |
+
{ name = "torchaudio", specifier = ">=2.7.0" },
|
974 |
{ name = "youtube-transcript-api", specifier = ">=1.0.3" },
|
975 |
{ name = "yt-dlp", specifier = ">=2025.3.31" },
|
976 |
]
|
|
|
3562 |
{ url = "https://files.pythonhosted.org/packages/32/46/9cb0e58b2deb7f82b84065f37f3bffeb12413f947f9388e4cac22c4621ce/sortedcontainers-2.4.0-py2.py3-none-any.whl", hash = "sha256:a163dcaede0f1c021485e957a39245190e74249897e2ae4b2aa38595db237ee0", size = 29575 },
|
3563 |
]
|
3564 |
|
3565 |
+
[[package]]
|
3566 |
+
name = "soundfile"
|
3567 |
+
version = "0.13.1"
|
3568 |
+
source = { registry = "https://pypi.org/simple" }
|
3569 |
+
dependencies = [
|
3570 |
+
{ name = "cffi" },
|
3571 |
+
{ name = "numpy" },
|
3572 |
+
]
|
3573 |
+
sdist = { url = "https://files.pythonhosted.org/packages/e1/41/9b873a8c055582859b239be17902a85339bec6a30ad162f98c9b0288a2cc/soundfile-0.13.1.tar.gz", hash = "sha256:b2c68dab1e30297317080a5b43df57e302584c49e2942defdde0acccc53f0e5b", size = 46156 }
|
3574 |
+
wheels = [
|
3575 |
+
{ url = "https://files.pythonhosted.org/packages/64/28/e2a36573ccbcf3d57c00626a21fe51989380636e821b341d36ccca0c1c3a/soundfile-0.13.1-py2.py3-none-any.whl", hash = "sha256:a23c717560da2cf4c7b5ae1142514e0fd82d6bbd9dfc93a50423447142f2c445", size = 25751 },
|
3576 |
+
{ url = "https://files.pythonhosted.org/packages/ea/ab/73e97a5b3cc46bba7ff8650a1504348fa1863a6f9d57d7001c6b67c5f20e/soundfile-0.13.1-py2.py3-none-macosx_10_9_x86_64.whl", hash = "sha256:82dc664d19831933fe59adad199bf3945ad06d84bc111a5b4c0d3089a5b9ec33", size = 1142250 },
|
3577 |
+
{ url = "https://files.pythonhosted.org/packages/a0/e5/58fd1a8d7b26fc113af244f966ee3aecf03cb9293cb935daaddc1e455e18/soundfile-0.13.1-py2.py3-none-macosx_11_0_arm64.whl", hash = "sha256:743f12c12c4054921e15736c6be09ac26b3b3d603aef6fd69f9dde68748f2593", size = 1101406 },
|
3578 |
+
{ url = "https://files.pythonhosted.org/packages/58/ae/c0e4a53d77cf6e9a04179535766b3321b0b9ced5f70522e4caf9329f0046/soundfile-0.13.1-py2.py3-none-manylinux_2_28_aarch64.whl", hash = "sha256:9c9e855f5a4d06ce4213f31918653ab7de0c5a8d8107cd2427e44b42df547deb", size = 1235729 },
|
3579 |
+
{ url = "https://files.pythonhosted.org/packages/57/5e/70bdd9579b35003a489fc850b5047beeda26328053ebadc1fb60f320f7db/soundfile-0.13.1-py2.py3-none-manylinux_2_28_x86_64.whl", hash = "sha256:03267c4e493315294834a0870f31dbb3b28a95561b80b134f0bd3cf2d5f0e618", size = 1313646 },
|
3580 |
+
{ url = "https://files.pythonhosted.org/packages/fe/df/8c11dc4dfceda14e3003bb81a0d0edcaaf0796dd7b4f826ea3e532146bba/soundfile-0.13.1-py2.py3-none-win32.whl", hash = "sha256:c734564fab7c5ddf8e9be5bf70bab68042cd17e9c214c06e365e20d64f9a69d5", size = 899881 },
|
3581 |
+
{ url = "https://files.pythonhosted.org/packages/14/e9/6b761de83277f2f02ded7e7ea6f07828ec78e4b229b80e4ca55dd205b9dc/soundfile-0.13.1-py2.py3-none-win_amd64.whl", hash = "sha256:1e70a05a0626524a69e9f0f4dd2ec174b4e9567f4d8b6c11d38b5c289be36ee9", size = 1019162 },
|
3582 |
+
]
|
3583 |
+
|
3584 |
[[package]]
|
3585 |
name = "soupsieve"
|
3586 |
version = "2.7"
|
|
|
3677 |
{ url = "https://files.pythonhosted.org/packages/1d/eb/cb8b01f5edf8f135eb3d0553d159db113a35b2948d0e51eeb735e7ae09ea/statsmodels-0.14.4-cp313-cp313-win_amd64.whl", hash = "sha256:81030108d27aecc7995cac05aa280cf8c6025f6a6119894eef648997936c2dd0", size = 9817574 },
|
3678 |
]
|
3679 |
|
3680 |
+
[[package]]
|
3681 |
+
name = "stockfish"
|
3682 |
+
version = "3.28.0"
|
3683 |
+
source = { registry = "https://pypi.org/simple" }
|
3684 |
+
sdist = { url = "https://files.pythonhosted.org/packages/dc/bd/b06883b957530867179da3672fa3d8b87a08a1e3c11ca9737d94b60689d2/stockfish-3.28.0.tar.gz", hash = "sha256:8764127c00434aa85b7fca1c064800e7ea907bb3626ccd4b583f0e911b014070", size = 16264 }
|
3685 |
+
wheels = [
|
3686 |
+
{ url = "https://files.pythonhosted.org/packages/95/b2/b3a204dd7754685fef5e62bf19b82d2572b321df619775502326cf7d383e/stockfish-3.28.0-py3-none-any.whl", hash = "sha256:e432e57112448fe1271dff402db3a4747bf00d1721815ce0a36cfa26582cf360", size = 13907 },
|
3687 |
+
]
|
3688 |
+
|
3689 |
[[package]]
|
3690 |
name = "striprtf"
|
3691 |
version = "0.0.26"
|
|
|
3850 |
{ url = "https://files.pythonhosted.org/packages/90/48/7e6477cf40d48cc0a61fa0d41ee9582b9a316b12772fcac17bc1a40178e7/torch-2.7.0-cp313-none-macosx_11_0_arm64.whl", hash = "sha256:27f5007bdf45f7bb7af7f11d1828d5c2487e030690afb3d89a651fd7036a390e", size = 68575074 },
|
3851 |
]
|
3852 |
|
3853 |
+
[[package]]
|
3854 |
+
name = "torchaudio"
|
3855 |
+
version = "2.7.0"
|
3856 |
+
source = { registry = "https://pypi.org/simple" }
|
3857 |
+
dependencies = [
|
3858 |
+
{ name = "torch" },
|
3859 |
+
]
|
3860 |
+
wheels = [
|
3861 |
+
{ url = "https://files.pythonhosted.org/packages/6e/d6/27deb8862ecc005c95a5c64bcc8cc27c74878eb8d4162ce4d39b35ea9e27/torchaudio-2.7.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:862d9c5cfe15688a7846962b5d3c9f959beffe82b1e5441935c7a37504c5c5e7", size = 1849075 },
|
3862 |
+
{ url = "https://files.pythonhosted.org/packages/04/95/29b4a4d87540779101cb60cb7f381fdb6bc6aea0af83f0f35aa8fc70cb0d/torchaudio-2.7.0-cp311-cp311-manylinux_2_28_aarch64.whl", hash = "sha256:677bd32031310ee73a47d6eebc2e74e74c1cf467932945ee88082a3935b5c950", size = 1686165 },
|
3863 |
+
{ url = "https://files.pythonhosted.org/packages/ab/20/1873a49df9f1778c241543eaca14d613d657b9f9351c254952114251cb86/torchaudio-2.7.0-cp311-cp311-manylinux_2_28_x86_64.whl", hash = "sha256:c37b77dd528ad18a036466e856f53d8bd5912b757a775309354b4a977a069379", size = 3455781 },
|
3864 |
+
{ url = "https://files.pythonhosted.org/packages/9e/1d/1fa4f69e4cd8c83831c3baad0ac9b56ece8ce0e75e5e5c0cdd3f591a458c/torchaudio-2.7.0-cp311-cp311-win_amd64.whl", hash = "sha256:36b94819f5406b2599ac31542e2e7a7aaf4a5b5f466ce034f296b1ee1134c945", size = 2494793 },
|
3865 |
+
{ url = "https://files.pythonhosted.org/packages/dd/b9/66dd7c4e16e8e6dcc52b4702ba7bbace589972b3597627d39d9dc3aa5fdd/torchaudio-2.7.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:65b4fc9b7f28367f918b02ae4db4290457bc4fdd160f22b7d684e93ab8dcb956", size = 1846733 },
|
3866 |
+
{ url = "https://files.pythonhosted.org/packages/47/48/850edf788c674494a7e148eee6f5563cae34c9a3e3e0962dcfce66c1dae7/torchaudio-2.7.0-cp312-cp312-manylinux_2_28_aarch64.whl", hash = "sha256:33004ed47f18f00044c97ee8cd9e3f5e1c2e26ef23d4f72b5f1ae33e6182587b", size = 1686687 },
|
3867 |
+
{ url = "https://files.pythonhosted.org/packages/78/98/ec8c7aba67b44cdc59717d4b43d02023ded5da180d33c6469d20bf5bfa3c/torchaudio-2.7.0-cp312-cp312-manylinux_2_28_x86_64.whl", hash = "sha256:a6f03494075bcdd62e7fade7baf50a0ef107aa809d02b5e1786391adced451a3", size = 3454437 },
|
3868 |
+
{ url = "https://files.pythonhosted.org/packages/5e/23/b73163ac06e5a724375df61a5b6c853861a825fe98e64388f277514153dd/torchaudio-2.7.0-cp312-cp312-win_amd64.whl", hash = "sha256:275931c8a38ff84b5692df990506b41f18d0a0706574d96bc8456ad9e5fa85c8", size = 2493451 },
|
3869 |
+
{ url = "https://files.pythonhosted.org/packages/c1/a5/bc4bb6b254d3d77e9fa4d219f29d3bff8db92acc9004c27e875f32d4724a/torchaudio-2.7.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:150fbde41da60296effed772b7a170f563cd44967555abb0603fc573f39ce245", size = 1847033 },
|
3870 |
+
{ url = "https://files.pythonhosted.org/packages/96/af/4c8d4e781ea5924590cccf8595a09081eb07a577c03fbf4bf04a2f5f7134/torchaudio-2.7.0-cp313-cp313-manylinux_2_28_aarch64.whl", hash = "sha256:9d921eeb036512a87efde007977b27bd326320cd7cd5f43195824173fe82e888", size = 1686308 },
|
3871 |
+
{ url = "https://files.pythonhosted.org/packages/12/02/ad1083f6ce534989c704c3efcd615bdd160934229882aa0a3ea95cd24a9a/torchaudio-2.7.0-cp313-cp313-manylinux_2_28_x86_64.whl", hash = "sha256:30675a5f99551e036974a7476729eb5d31f453cf792ae6e0a0d449960f84f464", size = 3455266 },
|
3872 |
+
{ url = "https://files.pythonhosted.org/packages/88/49/923ebb2603156dd5c5ae6d845bf51a078e05f27432cd26f13ecdcc8713cd/torchaudio-2.7.0-cp313-cp313-win_amd64.whl", hash = "sha256:ce8cfc07a4e59c835404583e7d3e171208b332b61bb92643f8723f6f192da8bf", size = 2493639 },
|
3873 |
+
{ url = "https://files.pythonhosted.org/packages/bf/85/dd4cd1202483e85c208e1ca3d31cc42c2972f1d955d11b742fa098a38a1b/torchaudio-2.7.0-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:9e08138cac75cde2064c8b5bbd12f27bdeb3d36f4b8c2285fc9c42eaa97c0676", size = 1929989 },
|
3874 |
+
{ url = "https://files.pythonhosted.org/packages/ef/3a/8a1045f2b00c6300827c1e6a3e661e9d219b5406ef103dc2824604548b8c/torchaudio-2.7.0-cp313-cp313t-manylinux_2_28_aarch64.whl", hash = "sha256:1d928aeff495a0807b4da3b0dd46e15eae8070da5e7ed6d35c1dcfd9fdfe2b74", size = 1700439 },
|
3875 |
+
{ url = "https://files.pythonhosted.org/packages/72/53/21d589a5a41702b5d37bae224286986cb707500d5ecdbfdcfdbac9381a08/torchaudio-2.7.0-cp313-cp313t-manylinux_2_28_x86_64.whl", hash = "sha256:ee4add33f24e9cb959bd9de89f36de5ebf844eda040d1d0b38f08617d67dedc3", size = 3466356 },
|
3876 |
+
{ url = "https://files.pythonhosted.org/packages/00/0b/5ef81aaacce5e9c316659ddc61a2b1e4f984a504d4a06fe61bab04cc75f1/torchaudio-2.7.0-cp313-cp313t-win_amd64.whl", hash = "sha256:725dbbcc9e744ca62de8856262c6f472ca26b1cd5db062b062a2d6b66a336cc0", size = 2544970 },
|
3877 |
+
]
|
3878 |
+
|
3879 |
[[package]]
|
3880 |
name = "tqdm"
|
3881 |
version = "4.67.1"
|