Spaces:
Running
Running
File size: 5,745 Bytes
9a6a4dc |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 |
#!/usr/bin/env python3
"""
Debug Audio Processing Issue
This script reproduces the MP3 audio processing issue that causes
malformed responses with "[}]" and UUID artifacts in GAIA evaluation.
"""
import os
import sys
import logging
import tempfile
from pathlib import Path
# Add the deployment-ready directory to Python path
sys.path.insert(0, str(Path(__file__).parent))
from agents.fixed_enhanced_unified_agno_agent import FixedGAIAAgent
# Set up root logging at DEBUG verbosity with a timestamped record layout,
# then bind the module-level logger used by every function in this script.
logging.basicConfig(
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
)
logger = logging.getLogger(__name__)
def create_test_mp3_file():
    """Write a placeholder MP3 to a temporary file and return its path.

    The file contains a 4-byte minimal MP3 header followed by 100 zero
    bytes of padding; it carries no actual audio. The temporary file is
    created with ``delete=False``, so the caller must remove it.

    Returns:
        The filesystem path of the created ``.mp3`` file.
    """
    header_bytes = b'\xff\xfb\x90\x00'
    zero_padding = bytes(100)
    handle = tempfile.NamedTemporaryFile(suffix='.mp3', delete=False)
    with handle:
        handle.write(header_bytes + zero_padding)
        handle.flush()
        created_path = handle.name
    return created_path
def test_audio_processing_issue():
    """Run the full agent on a placeholder MP3 and scan the answer for artifacts.

    Creates a temporary MP3 with ``create_test_mp3_file``, calls a
    ``FixedGAIAAgent`` instance with a question and that file, and logs an
    error or warning when the result contains the literal ``[}]``, a
    UUID-shaped token, or text that looks like a JSON/tool-call payload.
    The temporary file is removed whether or not processing succeeds.

    Returns:
        The agent's result, or ``None`` when the agent reports itself
        unavailable or an exception is raised during processing.
    """
    # Function-scope import, like ``traceback`` in the except branch below.
    import re

    # Eight-four-four-four-twelve hex groups; hyphens are optional so that
    # compact 32-character hex identifiers are reported as well.
    uuid_like = re.compile(
        r"\b[0-9a-f]{8}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{4}-?[0-9a-f]{12}\b",
        re.IGNORECASE,
    )

    logger.info("π Starting audio processing debug test...")

    # Create test MP3 file
    test_mp3_path = create_test_mp3_file()
    logger.info(f"π Created test MP3 file: {test_mp3_path}")

    try:
        # Initialize the agent
        logger.info("π Initializing FixedGAIAAgent...")
        agent = FixedGAIAAgent()

        if not agent.available:
            logger.error("β Agent not available - cannot test")
            return

        # Test question with MP3 file
        test_question = "What is said in this audio file?"
        test_files = [test_mp3_path]

        logger.info(f"π€ Testing question: {test_question}")
        logger.info(f"π With MP3 file: {test_mp3_path}")

        # Process the question - this should trigger the audio processing
        logger.info("π Processing question with MP3 file...")
        result = agent(test_question, test_files)

        logger.info(f"π Raw result: {repr(result)}")
        logger.info(f"π― Final result: '{result}'")

        # Check for malformed response patterns
        if "[}]" in result:
            logger.error("β FOUND '[}]' ARTIFACT in response!")

        # Only warn on an actual UUID-shaped token; a bare digit in the
        # answer (e.g. a year or a count) is not an artifact.
        if uuid_like.search(result):
            logger.warning("β οΈ Potential UUID-like patterns detected in response")

        # Check if result looks like a tool call or JSON
        if result.startswith('{') or '"name"' in result or '"arguments"' in result:
            logger.error("β FOUND JSON/TOOL CALL ARTIFACT in response!")

        return result

    except Exception as e:
        # Top-level boundary of a debug script: log everything and keep going
        # so the caller can still print its summary.
        logger.error(f"β Error during audio processing test: {e}")
        import traceback
        logger.error(f"π Traceback: {traceback.format_exc()}")
        return None

    finally:
        # Clean up test file; a failed delete is reported but never fatal.
        try:
            os.unlink(test_mp3_path)
            logger.info("π§Ή Cleaned up test MP3 file")
        except Exception as e:
            logger.warning(f"β οΈ Failed to clean up test file: {e}")
def test_multimodal_tools_directly():
    """Call the multimodal transcription tool directly on a placeholder MP3.

    Imports ``OpenSourceMultimodalTools`` lazily, transcribes a temporary
    file produced by ``create_test_mp3_file``, and logs an error when the
    transcription contains the literal ``[}]`` or looks like a JSON payload.
    The temporary file is removed in a ``finally`` clause so it is not left
    behind when transcription raises.

    Returns:
        The transcription returned by ``transcribe_audio``, or ``None`` when
        an exception is raised during import, setup, or transcription.
    """
    logger.info("π§ Testing multimodal tools directly...")

    # Stays None until the file exists, so cleanup knows whether to run.
    test_mp3_path = None

    try:
        from agents.mistral_multimodal_agent import OpenSourceMultimodalTools

        # Initialize multimodal tools
        multimodal = OpenSourceMultimodalTools()

        # Create test MP3 file
        test_mp3_path = create_test_mp3_file()

        # Test audio transcription directly
        logger.info("π΅ Testing audio transcription directly...")
        transcription = multimodal.transcribe_audio(test_mp3_path)

        logger.info(f"π Direct transcription result: {repr(transcription)}")

        # Check for artifacts
        if "[}]" in transcription:
            logger.error("β FOUND '[}]' ARTIFACT in direct transcription!")

        if transcription.startswith('{') or '"name"' in transcription:
            logger.error("β FOUND JSON ARTIFACT in direct transcription!")

        return transcription

    except Exception as e:
        # Top-level boundary of a debug script: log and report "no result".
        logger.error(f"β Error testing multimodal tools directly: {e}")
        import traceback
        logger.error(f"π Traceback: {traceback.format_exc()}")
        return None

    finally:
        # Clean up on every path; a failed delete must not discard a
        # transcription that was obtained successfully.
        if test_mp3_path is not None:
            try:
                os.unlink(test_mp3_path)
            except OSError as e:
                logger.warning(f"β οΈ Failed to clean up test file: {e}")
def main():
    """Run both debug tests and log a summary of where artifacts appear.

    First exercises the multimodal tools directly, then the full agent, and
    logs both results. If the direct result contains ``[}]`` the issue is
    attributed to the multimodal tools; otherwise, if the agent result
    contains it, to the agent processing pipeline.
    """
    # NOTE(review): the leading glyphs in these log messages look like
    # mis-decoded emoji; they are kept as found - confirm the intended text.
    logger.info("π GAIA Audio Processing Debug Tool")
    logger.info("=" * 50)

    # Test 1: Direct multimodal tools test
    logger.info("\nπ§ TEST 1: Direct Multimodal Tools Test")
    logger.info("-" * 40)
    direct_result = test_multimodal_tools_directly()

    # Test 2: Full agent test
    logger.info("\nπ€ TEST 2: Full Agent Test")
    logger.info("-" * 40)
    agent_result = test_audio_processing_issue()

    # Summary
    logger.info("\nπ DEBUG SUMMARY")
    logger.info("=" * 50)
    logger.info(f"Direct multimodal result: {repr(direct_result)}")
    logger.info(f"Full agent result: {repr(agent_result)}")

    # Analysis: the direct tools are checked first because an artifact there
    # would also surface through the agent that wraps them.
    if direct_result and "[}]" in direct_result:
        logger.error("π¨ ISSUE FOUND: '[}]' artifacts in direct multimodal tools")
    elif agent_result and "[}]" in agent_result:
        logger.error("π¨ ISSUE FOUND: '[}]' artifacts in agent processing pipeline")
    else:
        logger.info("β No '[}]' artifacts detected in this test")
# Run the debug tool only when executed as a script, not when imported.
if __name__ == "__main__":
    main()