Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

gaia-enhanced-agent / test_phase1_improvements.py

GAIA Agent Deployment

Deploy Complete Enhanced GAIA Agent with Phase 1-6 Improvements

9a6a4dc about 2 months ago

10.4 kB

	#!/usr/bin/env python3
	"""
	Test Phase 1 Improvements - Tool Execution and Answer Formatting

	This script tests the critical fixes implemented in Phase 1:
	1. Tool execution debugging and validation
	2. Enhanced answer formatting with multiple patterns
	3. GAIA format compliance validation
	4. Comprehensive error handling and fallback systems

	Usage:
	python test_phase1_improvements.py
	"""

	import os
	import sys
	import logging
	from pathlib import Path

	# Put this script's own directory at the front of sys.path so the
	# project packages imported inside the test functions resolve from here.
	_script_dir = Path(__file__).parent
	sys.path.insert(0, str(_script_dir))

	# Root logging: INFO and above, each record stamped with time, logger
	# name and level.
	_LOG_FORMAT = '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
	logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)
	logger = logging.getLogger(__name__)

	def test_tool_execution_debugger():
	    """Smoke-test ToolExecutionDebugger: JSON detection, tool validation, stats.

	    The debugger is imported inside the ``try`` so that a missing module is
	    reported as a failed test instead of aborting the whole script. Results
	    of each call are logged only; nothing is compared against an expected
	    value.

	    Returns:
	        bool: True if every debugger call completed without raising,
	        False if any exception (including the import) was caught.
	    """
	    logger.info("🔧 Testing ToolExecutionDebugger...")

	    try:
	        from utils.tool_execution_debugger import ToolExecutionDebugger

	        debugger = ToolExecutionDebugger()

	        # Test JSON syntax detection
	        test_responses = [
	            "The answer is 42",  # Normal response
	            '{"function": "calculator", "parameters": {"expression": "2+2"}}',  # JSON syntax issue
	            "FINAL ANSWER: 42",  # Proper format
	            "I need to use the calculator tool: {\"tool\": \"calc\"}",  # Mixed content
	        ]

	        for number, response in enumerate(test_responses, start=1):
	            issues = debugger.detect_json_syntax_in_response(response)
	            logger.info(f" Test {number}: {'❌ Issues detected' if issues else '✅ Clean'} - {issues}")

	        # Test tool validation
	        class MockTool:
	            """Minimal stand-in for a tool whose class is named after the tool."""

	            def __init__(self, name):
	                self.name = name

	            # Must be a property: as a plain method, ``obj.__class__`` would
	            # evaluate to the bound method itself (whose ``__name__`` is
	            # "__class__") instead of a class named after the tool.
	            @property
	            def __class__(self):
	                return type(self.name, (), {})

	        mock_tool = MockTool("TestTool")
	        validation = debugger.validate_tool_registration("TestTool", mock_tool)
	        logger.info(f" Tool validation: {validation}")

	        # Get debug stats
	        stats = debugger.get_debug_stats()
	        logger.info(f" Debug stats: {stats}")

	        logger.info("✅ ToolExecutionDebugger tests passed")
	        return True

	    except Exception as e:
	        # Top-level boundary of this test: report any failure and carry on
	        # so the remaining tests still run.
	        logger.error(f"❌ ToolExecutionDebugger test failed: {e}")
	        return False

	def test_enhanced_answer_formatter():
	    """Run sample model responses through EnhancedGAIAAnswerFormatter.

	    Each sample is handed to ``format_answer`` together with its question.
	    A sample counts as successful when that call does not raise; the
	    formatted output is logged but not compared against an expected value.

	    Returns:
	        bool: True when every sample formatted without raising; False when
	        any sample raised, or when anything outside the per-sample loop
	        (import, construction, stats retrieval) raised.
	    """
	    logger.info("🎯 Testing EnhancedGAIAAnswerFormatter...")

	    try:
	        from utils.enhanced_gaia_answer_formatter import EnhancedGAIAAnswerFormatter

	        formatter = EnhancedGAIAAnswerFormatter()

	        # (raw response, question, expected answer type, description).
	        # The expected type is informational only; it is never checked.
	        samples = [
	            # Number formatting
	            ("The calculation gives us 1,234.50 as the result.",
	             "What is 1000 + 234.5?",
	             'number',
	             'Number with comma removal'),
	            ("FINAL ANSWER: 42",
	             "How many items are there?",
	             'number',
	             'Simple FINAL ANSWER format'),

	            # String formatting
	            ("The capital of France is Paris.",
	             "What is the capital of France?",
	             'string',
	             'String extraction from sentence'),
	            ('FINAL ANSWER: "The Eiffel Tower"',
	             "What is the famous tower in Paris?",
	             'string',
	             'String with quotes removal'),

	            # List formatting
	            ("The colors are red, blue, and green.",
	             "List three primary colors",
	             'list',
	             'List with "and" removal'),
	            ("FINAL ANSWER: apple; banana; orange",
	             "Name three fruits",
	             'list',
	             'List with semicolon separation'),

	            # Boolean formatting
	            ("Yes, Paris is in France.",
	             "Is Paris in France?",
	             'boolean',
	             'Boolean yes answer'),
	            ("No, that is incorrect.",
	             "Is London in Germany?",
	             'boolean',
	             'Boolean no answer'),

	            # Complex cases
	            ("After analyzing the data, I can conclude that the answer is 3.14159.",
	             "What is the value of pi to 5 decimal places?",
	             'number',
	             'Number extraction from complex text'),
	            ("Let me search for this information... The result shows that Einstein was born in 1879.",
	             "When was Einstein born?",
	             'number',
	             'Year extraction from narrative'),
	        ]

	        outcomes = []
	        for number, (raw, question, _expected_type, label) in enumerate(samples, start=1):
	            # Inputs longer than 50 characters are cut and marked with "...".
	            preview = raw if len(raw) <= 50 else raw[:50] + "..."
	            try:
	                formatted = formatter.format_answer(raw, question)
	                outcomes.append({
	                    'test': number,
	                    'description': label,
	                    'input': preview,
	                    'output': formatted,
	                    'status': '✅ Success',
	                })
	                logger.info(f" Test {number}: ✅ {label} → '{formatted}'")
	            except Exception as exc:
	                # One bad sample must not stop the rest; record and go on.
	                outcomes.append({
	                    'test': number,
	                    'description': label,
	                    'input': preview,
	                    'output': f"Error: {exc}",
	                    'status': '❌ Failed',
	                })
	                logger.error(f" Test {number}: ❌ {label} failed: {exc}")

	        # Get formatting statistics
	        stats = formatter.get_formatting_stats()
	        logger.info(f" Formatting stats: {stats}")

	        # Summary
	        succeeded = [entry for entry in outcomes if entry['status'] == '✅ Success']
	        successful_tests = len(succeeded)
	        logger.info(f"✅ Enhanced formatter tests: {successful_tests}/{len(samples)} passed")

	        return successful_tests == len(samples)

	    except Exception as exc:
	        logger.error(f"❌ EnhancedGAIAAnswerFormatter test failed: {exc}")
	        return False

	def test_agent_integration():
	    """Exercise GAIAAgent end to end with one trivial question.

	    When the MISTRAL_API_KEY environment variable is unset or empty the test
	    is skipped and reported as passing. Otherwise the agent is constructed,
	    its tool status is logged, and it is asked "What is 2 + 2?".

	    Returns:
	        bool: True when the test was skipped or the agent returned a
	        non-empty response that is not one of its two error strings;
	        False when the agent is unavailable, returned an error response,
	        or anything raised.
	    """
	    logger.info("🤖 Testing agent integration...")

	    try:
	        # Check if MISTRAL_API_KEY is available
	        api_key = os.environ.get("MISTRAL_API_KEY")
	        if not api_key:
	            logger.warning("⚠️ MISTRAL_API_KEY not found - skipping agent integration test")
	            return True

	        from agents.enhanced_unified_agno_agent import GAIAAgent

	        # Initialize agent
	        agent = GAIAAgent()
	        if not agent.available:
	            logger.warning("⚠️ Agent not available - check API key and dependencies")
	            return False

	        # Test tool status
	        tool_status = agent.get_tool_status()
	        logger.info(f" Tool status: {tool_status}")

	        # Test simple question (if agent is available)
	        test_question = "What is 2 + 2?"
	        logger.info(f" Testing question: {test_question}")

	        try:
	            response = agent(test_question)
	            logger.info(f" Response: {response}")

	            # An empty response or either error string means failure.
	            if not (
	                response
	                and response != "Agent not available"
	                and response != "Unable to process this question"
	            ):
	                logger.warning("⚠️ Agent returned error response")
	                return False

	            logger.info("✅ Agent integration test passed")
	            return True

	        except Exception as exc:
	            logger.error(f"❌ Agent execution failed: {exc}")
	            return False

	    except Exception as exc:
	        logger.error(f"❌ Agent integration test failed: {exc}")
	        return False

	def run_phase1_tests():
	    """Run the three Phase 1 tests in order and log a pass/fail summary.

	    Returns:
	        bool: True when every test function returned a truthy result.
	    """
	    separator = "=" * 60
	    logger.info("🚀 Starting Phase 1 Improvement Tests")
	    logger.info(separator)

	    # Result key -> test function, in execution order.
	    suite = (
	        ('tool_debugger', test_tool_execution_debugger),      # Test 1
	        ('answer_formatter', test_enhanced_answer_formatter),  # Test 2
	        ('agent_integration', test_agent_integration),        # Test 3
	    )
	    test_results = {}
	    for key, test_fn in suite:
	        test_results[key] = test_fn()

	    # Summary
	    logger.info(separator)
	    logger.info("📊 Phase 1 Test Results Summary:")

	    total_tests = len(test_results)
	    passed_tests = 0
	    for outcome in test_results.values():
	        if outcome:
	            passed_tests += 1

	    for test_name, outcome in test_results.items():
	        if outcome:
	            status = "✅ PASSED"
	        else:
	            status = "❌ FAILED"
	        logger.info(f" {test_name}: {status}")

	    logger.info(f"\nOverall: {passed_tests}/{total_tests} tests passed")

	    all_passed = passed_tests == total_tests
	    if not all_passed:
	        logger.warning("⚠️ Some tests failed - review logs and fix issues before proceeding")
	    else:
	        logger.info("🎉 All Phase 1 improvements are working correctly!")
	        logger.info("📈 Ready to proceed with Phase 2 (Answer Formatting Enhancement)")

	    return all_passed

	if __name__ == "__main__":
	    # Process exit status: 0 when every test passed, 1 otherwise.
	    sys.exit(0 if run_phase1_tests() else 1)