"""
Audio Content Analyzer for GAIA Agent

Provides intelligent content parsing and analysis from audio transcriptions.
Specialized for GAIA evaluation tasks, including recipe analysis and
educational content.
"""
import logging
import re
from typing import Dict, Any, List, Optional, Tuple
import json

# Prefer the package-relative import; fall back to a plain import when the
# relative form raises ImportError (e.g. the module is not loaded as part of
# a package).
try:
    from .base_tool import SimpleAGNOTool
except ImportError:
    from base_tool import SimpleAGNOTool

# Module-level logger shared by everything defined in this file.
logger = logging.getLogger(__name__)
class AudioContentAnalyzer(SimpleAGNOTool):
    """
    Rule-based content analyzer for audio transcription text.

    Everything here is keyword and regular-expression matching on the
    transcription string; no audio is processed by this class.

    Capabilities:
    - Recipe detection and ingredient extraction
    - Educational content detection (page, chapter and exercise numbers)
    - Structured data extraction (quantities, units, subjects)
    - A combined entry point, ``extract_key_information``
    - JSON-schema style function definitions via ``get_tool_functions``
    """

    # Measurement units accepted by the quantity-bearing ingredient patterns.
    _UNIT_ALTERNATION = (
        r'cups?|tablespoons?|tbsp|teaspoons?|tsp|pounds?|lbs?|'
        r'ounces?|oz|grams?|g'
    )

    # Substrings whose presence counts towards "this text is a recipe".
    _RECIPE_INDICATORS = (
        'recipe', 'ingredients', 'cooking', 'baking', 'pie', 'cake',
        'mix', 'stir', 'add', 'combine', 'bake', 'cook', 'prepare',
    )

    # (keyword, recipe_type) pairs, checked in order; first hit wins.
    _RECIPE_TYPES = (
        ('pie', 'pie'),
        ('cake', 'cake'),
        ('cookie', 'cookies'),
        ('bread', 'bread'),
    )

    _COOKING_METHODS = (
        'bake', 'mix', 'stir', 'whip', 'fold', 'beat', 'combine',
        'add', 'pour', 'melt', 'heat', 'cool', 'chill', 'freeze',
    )

    # Each pattern captures exactly two groups: (amount, unit word).
    _QUANTITY_PATTERNS = (
        r'(\d+(?:\.\d+)?)\s*(cups?|tablespoons?|teaspoons?|pounds?|ounces?)',
        r'(\d+)\s*(degrees?)',
        r'(\d+)\s*(minutes?)',
        r'(\d+)\s*(hours?)',
    )

    # Substrings whose presence counts towards "this text is educational".
    _EDUCATION_INDICATORS = (
        'homework', 'assignment', 'page', 'chapter', 'exercise',
        'problem', 'study', 'lesson', 'class', 'school', 'teacher',
        'student', 'book', 'textbook', 'worksheet',
    )

    # Subject label -> keywords that indicate it (substring match).
    _SUBJECT_KEYWORDS = {
        'math': ['math', 'mathematics', 'algebra', 'geometry', 'calculus', 'arithmetic'],
        'science': ['science', 'physics', 'chemistry', 'biology', 'astronomy'],
        'english': ['english', 'literature', 'reading', 'writing', 'grammar'],
        'history': ['history', 'social studies', 'geography', 'civics'],
        'language': ['spanish', 'french', 'german', 'italian', 'chinese', 'japanese'],
    }

    # Name endings that mark a candidate as food-like.
    _FOOD_SUFFIXES = (
        'flour', 'sugar', 'powder', 'extract', 'juice', 'zest', 'oil', 'sauce',
    )

    # Exact candidates that are never ingredients.
    _NON_INGREDIENTS = frozenset({
        'minutes', 'degrees', 'hours', 'time', 'temperature',
        'oven', 'bowl', 'pan', 'spoon', 'cup', 'tablespoon',
    })

    def __init__(self):
        """Initialize the analyzer and build its pattern tables."""
        super().__init__(
            name="audio_content_analyzer",
            description="Analyze audio transcriptions for structured content extraction and understanding"
        )

        # Set availability status
        self.available = True

        # Recipe analysis patterns. Named groups are used so that the
        # extraction code never depends on the positional order of the
        # captures, which differs between the patterns.
        quantity = r'(?P<quantity>\d+(?:\.\d+)?)'
        unit = r'(?P<unit>' + self._UNIT_ALTERNATION + r')'
        ingredient = r'(?P<ingredient>[a-zA-Z\s]+?)'
        # A clause ends at a comma, full stop, newline or end of text.
        clause_end = r'(?=\s*[,.\n]|$)'
        self.ingredient_patterns = [
            # Pattern: "2 cups of flour"
            quantity + r'\s+' + unit + r'\s+(?:of\s+)?' + ingredient + clause_end,
            # Pattern: "flour, 2 cups" -- the quantity must close the clause,
            # otherwise "2 cups of flour, 1 cup sugar" would pair "flour"
            # with the quantity that belongs to "sugar".
            ingredient + r',?\s*' + quantity + r'\s+' + unit + clause_end,
            # Pattern: "add flour" (\b keeps "misuse ..." from matching "use")
            r'\b(?:add|use|mix|combine|include)\s+' + ingredient + clause_end,
        ]

        # Common ingredients for validation
        self.common_ingredients = {
            'flour', 'sugar', 'butter', 'eggs', 'egg', 'milk', 'cream', 'vanilla',
            'strawberries', 'strawberry', 'berries', 'berry', 'fruit', 'salt',
            'baking powder', 'baking soda', 'powder', 'soda', 'cinnamon', 'nutmeg',
            'lemon', 'orange', 'chocolate', 'nuts', 'almonds', 'pecans', 'walnuts',
            'honey', 'syrup', 'oil', 'shortening', 'cornstarch', 'gelatin',
            'water', 'juice', 'zest', 'extract', 'spice', 'spices'
        }

        # Educational content patterns; every pattern captures one number.
        self.education_patterns = {
            'page_numbers': [
                r'page\s+(\d+)',
                r'on\s+page\s+(\d+)',
                r'turn\s+to\s+page\s+(\d+)',
                r'go\s+to\s+page\s+(\d+)',
                r'see\s+page\s+(\d+)',
                r'page\s+number\s+(\d+)'
            ],
            'chapter_numbers': [
                r'chapter\s+(\d+)',
                r'unit\s+(\d+)',
                r'section\s+(\d+)'
            ],
            'exercise_numbers': [
                r'exercise\s+(\d+)',
                r'problem\s+(\d+)',
                r'question\s+(\d+)',
                r'assignment\s+(\d+)'
            ]
        }

    @staticmethod
    def _empty_recipe_analysis() -> Dict[str, Any]:
        """Return a fresh recipe result with every field at its empty value."""
        return {
            'is_recipe': False,
            'confidence': 0.0,
            'ingredients': [],
            'quantities': [],
            'cooking_methods': [],
            'recipe_type': None,
            'structured_ingredients': []
        }

    @staticmethod
    def _empty_educational_analysis() -> Dict[str, Any]:
        """Return a fresh educational result with every field at its empty value."""
        return {
            'is_educational': False,
            'confidence': 0.0,
            'page_numbers': [],
            'chapter_numbers': [],
            'exercise_numbers': [],
            'subjects': [],
            'assignments': [],
            'key_numbers': []
        }

    @staticmethod
    def _collect_numbers(patterns: List[str], text: str) -> List[str]:
        """Return the distinct numbers captured by any pattern, in numeric order."""
        found = set()
        for pattern in patterns:
            found.update(re.findall(pattern, text))
        return sorted(found, key=int)

    def analyze_recipe_content(self, transcription: str) -> Dict[str, Any]:
        """
        Analyze transcription for recipe content and extract ingredients.

        The text counts as a recipe when at least two recipe indicator
        words occur in it; confidence is the indicator count divided by
        five, capped at 1.0. Non-recipe text returns early with empty lists.

        Args:
            transcription: Audio transcription text

        Returns:
            Dictionary with keys 'is_recipe', 'confidence', 'ingredients',
            'quantities', 'cooking_methods', 'recipe_type' and
            'structured_ingredients'. If anything raises, the same keys are
            returned at their empty values plus an 'error' message.
        """
        try:
            logger.info("Analyzing recipe content from transcription")
            analysis = self._empty_recipe_analysis()
            text_lower = transcription.lower()

            # Check if this is likely a recipe
            recipe_score = sum(
                1 for indicator in self._RECIPE_INDICATORS if indicator in text_lower
            )
            analysis['is_recipe'] = recipe_score >= 2
            analysis['confidence'] = min(1.0, recipe_score / 5.0)
            if not analysis['is_recipe']:
                logger.info("Content does not appear to be a recipe")
                return analysis

            # Determine recipe type (first matching keyword wins)
            analysis['recipe_type'] = next(
                (label for keyword, label in self._RECIPE_TYPES if keyword in text_lower),
                None
            )

            # Extract ingredients using multiple patterns. groupdict() gives
            # each capture by name, so quantity/unit/ingredient cannot be
            # mixed up; groups a pattern does not define come back missing.
            ingredients_found = set()
            structured_ingredients = []
            for pattern in self.ingredient_patterns:
                for match in re.finditer(pattern, transcription, re.IGNORECASE):
                    groups = match.groupdict()
                    ingredient = (groups.get('ingredient') or '').strip().lower()
                    if not self._is_valid_ingredient(ingredient):
                        continue
                    unit = groups.get('unit')
                    ingredients_found.add(ingredient)
                    structured_ingredients.append({
                        'ingredient': ingredient,
                        'quantity': groups.get('quantity'),
                        'unit': unit.lower() if unit else None
                    })

            # Additional ingredient extraction for common items. Whole-word
            # matching keeps 'oil' from firing on "boil"; sorted iteration
            # makes the output order reproducible.
            for ingredient in sorted(self.common_ingredients):
                if ingredient in ingredients_found:
                    continue
                if re.search(r'\b' + re.escape(ingredient) + r'\b', text_lower):
                    ingredients_found.add(ingredient)
                    structured_ingredients.append({
                        'ingredient': ingredient,
                        'quantity': None,
                        'unit': None
                    })

            analysis['ingredients'] = sorted(ingredients_found)
            analysis['structured_ingredients'] = structured_ingredients

            # Extract cooking methods
            analysis['cooking_methods'] = [
                method for method in self._COOKING_METHODS if method in text_lower
            ]

            # Extract quantities and measurements
            for pattern in self._QUANTITY_PATTERNS:
                for amount, unit_word in re.findall(pattern, text_lower):
                    analysis['quantities'].append(f"{amount} {unit_word}")

            logger.info(
                "Recipe analysis completed: %d ingredients found",
                len(analysis['ingredients'])
            )
            return analysis
        except Exception as e:
            # Best-effort API: report the failure in the result, do not raise.
            logger.exception("Recipe analysis failed: %s", e)
            failure = self._empty_recipe_analysis()
            failure['error'] = str(e)
            return failure

    def analyze_educational_content(self, transcription: str) -> Dict[str, Any]:
        """
        Analyze transcription for educational content and extract key information.

        The text counts as educational when at least two education indicator
        words occur in it; confidence is the indicator count divided by
        five, capped at 1.0. Other text returns early with empty lists.

        Args:
            transcription: Audio transcription text

        Returns:
            Dictionary with keys 'is_educational', 'confidence',
            'page_numbers', 'chapter_numbers', 'exercise_numbers',
            'subjects', 'assignments' and 'key_numbers'. Number lists hold
            distinct digit strings in numeric order. If anything raises,
            the same keys are returned at their empty values plus 'error'.
        """
        try:
            logger.info("Analyzing educational content from transcription")
            analysis = self._empty_educational_analysis()
            text_lower = transcription.lower()

            # Check if this is educational content
            education_score = sum(
                1 for indicator in self._EDUCATION_INDICATORS if indicator in text_lower
            )
            analysis['is_educational'] = education_score >= 2
            analysis['confidence'] = min(1.0, education_score / 5.0)
            if not analysis['is_educational']:
                logger.info("Content does not appear to be educational")
                return analysis

            # Page, chapter and exercise numbers all get the same treatment:
            # collected across their patterns, de-duplicated, sorted.
            for field in ('page_numbers', 'chapter_numbers', 'exercise_numbers'):
                analysis[field] = self._collect_numbers(
                    self.education_patterns[field], text_lower
                )

            # Identify subjects
            analysis['subjects'] = [
                subject
                for subject, keywords in self._SUBJECT_KEYWORDS.items()
                if any(keyword in text_lower for keyword in keywords)
            ]

            # Extract all numbers for potential reference
            analysis['key_numbers'] = sorted(
                set(re.findall(r'\b\d+\b', transcription)), key=int
            )

            logger.info(
                "Educational analysis completed: %d page numbers found",
                len(analysis['page_numbers'])
            )
            return analysis
        except Exception as e:
            # Best-effort API: report the failure in the result, do not raise.
            logger.exception("Educational analysis failed: %s", e)
            failure = self._empty_educational_analysis()
            failure['error'] = str(e)
            return failure

    def extract_key_information(self, transcription: str, target_type: str) -> Dict[str, Any]:
        """
        Extract specific key information from transcription based on target type.

        Args:
            transcription: Audio transcription text
            target_type: Type of information to extract
                ('recipe_ingredients', 'page_numbers', 'all'). Any other
                value runs no analysis and yields the 'all'-shaped result
                with empty lists.

        Returns:
            Dictionary with 'target_type', 'success', 'extracted_data',
            'confidence' and 'primary_result'. 'primary_result' is a list
            for the two specific target types and a dict with keys
            'recipe_ingredients' and 'page_numbers' otherwise. On failure
            'success' is False, 'error' is set and 'primary_result' is absent.
        """
        try:
            logger.info("Extracting key information: %s", target_type)
            result = {
                'target_type': target_type,
                'success': True,
                'extracted_data': {},
                'confidence': 0.0
            }
            extracted = result['extracted_data']

            if target_type in ('recipe_ingredients', 'all'):
                recipe = self.analyze_recipe_content(transcription)
                extracted['recipe'] = recipe
                if recipe['is_recipe']:
                    result['confidence'] = max(result['confidence'], recipe['confidence'])

            if target_type in ('page_numbers', 'all'):
                education = self.analyze_educational_content(transcription)
                extracted['education'] = education
                if education['is_educational']:
                    result['confidence'] = max(result['confidence'], education['confidence'])

            # Extract the most relevant information based on target type
            if target_type == 'recipe_ingredients':
                recipe = extracted['recipe']
                result['primary_result'] = recipe['ingredients'] if recipe['is_recipe'] else []
            elif target_type == 'page_numbers':
                education = extracted['education']
                result['primary_result'] = (
                    education['page_numbers'] if education['is_educational'] else []
                )
            else:  # 'all'
                if target_type != 'all':
                    logger.warning(
                        "Unknown target_type %r; returning empty combined result",
                        target_type
                    )
                result['primary_result'] = {
                    'recipe_ingredients': extracted.get('recipe', {}).get('ingredients', []),
                    'page_numbers': extracted.get('education', {}).get('page_numbers', [])
                }

            logger.info(
                "Key information extraction completed with confidence: %.2f",
                result['confidence']
            )
            return result
        except Exception as e:
            logger.exception("Key information extraction failed: %s", e)
            return {
                'target_type': target_type,
                'success': False,
                'error': str(e),
                'extracted_data': {},
                'confidence': 0.0
            }

    def _is_valid_ingredient(self, ingredient: str) -> bool:
        """
        Check if a string is likely a valid ingredient.

        Deliberately permissive: after the known-ingredient and food-suffix
        checks, any purely alphabetic phrase of 2-30 characters that is not
        on the exclusion list is accepted.
        """
        candidate = ingredient.strip().lower()

        # Must be at least 2 characters
        if len(candidate) < 2:
            return False

        # Known ingredient, or a phrase containing a known ingredient word
        if candidate in self.common_ingredients:
            return True
        if any(word in self.common_ingredients for word in candidate.split()):
            return True

        # Food-related name endings, e.g. "cornflour" or "soy sauce"
        if candidate.endswith(self._FOOD_SUFFIXES):
            return True

        # Exclude common non-ingredients
        if candidate in self._NON_INGREDIENTS:
            return False

        # Reasonable length and only letters/whitespace: consider it valid
        return len(candidate) <= 30 and re.fullmatch(r'[a-zA-Z\s]+', candidate) is not None

    def get_tool_functions(self) -> List[Dict[str, Any]]:
        """Get function definitions for AGNO integration."""

        def text_param(description: str) -> Dict[str, Any]:
            # Schema fragment for a plain string argument.
            return {"type": "string", "description": description}

        def function_spec(name: str, description: str,
                          properties: Dict[str, Any],
                          required: List[str]) -> Dict[str, Any]:
            # One function definition with an object-typed parameter schema.
            return {
                "name": name,
                "description": description,
                "parameters": {
                    "type": "object",
                    "properties": properties,
                    "required": required
                }
            }

        return [
            function_spec(
                "analyze_recipe_content",
                "Analyze audio transcription for recipe content and extract ingredients",
                {
                    "transcription": text_param(
                        "Audio transcription text to analyze for recipe content"
                    )
                },
                ["transcription"]
            ),
            function_spec(
                "analyze_educational_content",
                "Analyze audio transcription for educational content and extract page numbers, assignments",
                {
                    "transcription": text_param(
                        "Audio transcription text to analyze for educational content"
                    )
                },
                ["transcription"]
            ),
            function_spec(
                "extract_key_information",
                "Extract specific key information from audio transcription",
                {
                    "transcription": text_param("Audio transcription text to analyze"),
                    "target_type": {
                        "type": "string",
                        "description": "Type of information to extract",
                        "enum": ["recipe_ingredients", "page_numbers", "all"]
                    }
                },
                ["transcription", "target_type"]
            )
        ]
# Create tool instance for AGNO integration
def create_audio_content_analyzer() -> Optional[AudioContentAnalyzer]:
    """
    Create and return audio content analyzer instance.

    Returns:
        A new AudioContentAnalyzer, or None when construction raises.
        The failure is logged with its traceback instead of propagating.
    """
    try:
        tool = AudioContentAnalyzer()
    except Exception as e:
        # Best-effort factory: callers receive None rather than an exception.
        logger.exception("Failed to create audio content analyzer: %s", e)
        return None
    else:
        logger.info("Audio content analyzer created successfully")
        return tool