Spaces:

JoachimVC
/

gaia-enhanced-agent

Running

File size: 20,438 Bytes

9a6a4dc

"""
Audio File Handler for GAIA Agent
Provides comprehensive audio file processing capabilities including:
- Multi-format audio file processing and conversion
- Audio normalization and quality enhancement
- Metadata extraction and validation
- Streaming support for large files
"""

import os
import logging
import tempfile
import shutil
from typing import Dict, Any, Optional, List, Tuple, Union
from pathlib import Path
import json

try:
    import soundfile as sf
    import numpy as np
    AUDIO_DEPS_AVAILABLE = True
except ImportError as e:
    logging.warning(f"Audio dependencies not available: {e}")
    AUDIO_DEPS_AVAILABLE = False

logger = logging.getLogger(__name__)


class AudioFileHandler:
    """
    Audio file handler for GAIA evaluation tasks.

    What this class actually does:
    - Validates a file (existence, size, extension, readability)
    - Down-mixes to mono and peak-normalizes, writing the result to a new file
    - Extracts file/audio metadata with a heuristic quality assessment
    - Manages a temporary directory for normalized output

    Notes:
    - All decoding/encoding goes through ``soundfile``. An extension listed in
      ``supported_formats`` passes the extension check but is still rejected
      by the readability check when the file cannot be opened.
    - Sample-rate conversion is NOT implemented; the source sample rate is
      preserved on output and a mismatch with the target is only logged.
    - Audio is read fully into memory; there is no streaming path.
    - Public methods catch ``Exception`` and report failures through their
      result dictionaries instead of raising.
    """

    def __init__(self):
        """Initialize the audio file handler with its default limits and targets."""
        self.available = AUDIO_DEPS_AVAILABLE
        self.supported_formats = ['.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma', '.webm']
        self.max_file_size = 100 * 1024 * 1024  # 100MB
        # Created lazily by normalize_audio(); removed by cleanup_temp_files().
        self.temp_dir = None

        # Audio processing parameters
        self.target_sample_rate = 16000  # Optimal for Whisper
        self.target_channels = 1  # Mono for speech recognition
        # Minimum quality score; not referenced by any method in this class.
        self.quality_threshold = 0.7

        if not self.available:
            logger.warning("⚠️ Audio file handler not available - missing dependencies")
        else:
            logger.info("✅ Audio file handler initialized")

    def _new_validation_result(self) -> Dict[str, Any]:
        """Return a fresh, all-negative validation result with every expected key."""
        return {
            'valid': False,
            'file_exists': False,
            'format_supported': False,
            'size_acceptable': False,
            'readable': False,
            'info': {},
            'errors': [],
            'warnings': []
        }

    def validate_audio_file(self, file_path: str) -> Dict[str, Any]:
        """
        Validate an audio file step by step, stopping at the first hard failure.

        Checks run in this order: existence, non-empty, size limit, extension,
        dependency availability, readability via ``soundfile``.

        Args:
            file_path: Path to the audio file

        Returns:
            Dictionary with the keys ``valid``, ``file_exists``,
            ``format_supported``, ``size_acceptable``, ``readable``, ``info``,
            ``errors`` and ``warnings``. ``info`` is filled only when the file
            could be read.
        """
        try:
            path = Path(file_path)
            validation_result = self._new_validation_result()

            # Check if file exists
            if not path.exists():
                validation_result['errors'].append(f"File not found: {file_path}")
                return validation_result

            validation_result['file_exists'] = True

            # Check file size
            file_size = path.stat().st_size
            if file_size == 0:
                validation_result['errors'].append("File is empty")
                return validation_result

            if file_size > self.max_file_size:
                validation_result['errors'].append(
                    f"File too large: {file_size / (1024*1024):.1f}MB (max: {self.max_file_size / (1024*1024)}MB)"
                )
                return validation_result

            validation_result['size_acceptable'] = True

            # Check file format (by extension only; content is checked below)
            file_ext = path.suffix.lower()
            if file_ext not in self.supported_formats:
                validation_result['errors'].append(
                    f"Unsupported format: {file_ext}. Supported: {', '.join(self.supported_formats)}"
                )
                return validation_result

            validation_result['format_supported'] = True

            # Try to read audio file and extract metadata
            try:
                if not self.available:
                    validation_result['errors'].append("Audio processing dependencies not available")
                    return validation_result

                info = sf.info(file_path)

                audio_info = {
                    'duration': info.duration,
                    'sample_rate': info.samplerate,
                    'channels': info.channels,
                    'frames': info.frames,
                    'format': info.format,
                    'subtype': info.subtype,
                    'file_size_mb': file_size / (1024 * 1024)
                }

                validation_result['info'] = audio_info
                validation_result['readable'] = True

                # Quality checks produce warnings only; they never invalidate the file.
                if info.duration < 0.1:
                    validation_result['warnings'].append("Very short audio duration")
                elif info.duration > 3600:  # 1 hour
                    validation_result['warnings'].append("Very long audio file - processing may take time")

                if info.samplerate < 8000:
                    validation_result['warnings'].append("Low sample rate - may affect transcription quality")

                validation_result['valid'] = True

            except Exception as e:
                validation_result['errors'].append(f"Cannot read audio file: {str(e)}")
                return validation_result

            logger.info(f"✅ Audio file validation successful: {file_path}")
            return validation_result

        except Exception as e:
            logger.error(f"❌ Audio file validation failed: {e}")
            # Same key set as every other return path (including 'warnings'),
            # so callers can index the result without special-casing failures.
            failure = self._new_validation_result()
            failure['errors'].append(f"Validation error: {str(e)}")
            return failure

    def normalize_audio(self, file_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
        """
        Down-mix to mono and peak-normalize an audio file.

        The sample rate is left unchanged: a mismatch with
        ``target_sample_rate`` is logged but no resampling is performed, and
        therefore no resampling is reported in ``changes_made``.

        Args:
            file_path: Path to input audio file
            output_path: Path for normalized output (optional, creates temp file if None)

        Returns:
            On success: ``success`` (True), ``output_path``, ``original_info``,
            ``normalized_info`` (empty if the written file fails validation)
            and ``changes_made`` (only the operations that were really
            applied). On failure: ``success`` (False), ``error`` and
            ``output_path`` (None).
        """
        try:
            if not self.available:
                return {
                    'success': False,
                    'error': 'Audio processing dependencies not available',
                    'output_path': None
                }

            logger.info(f"🔧 Normalizing audio file: {file_path}")

            # Validate input file
            validation = self.validate_audio_file(file_path)
            if not validation['valid']:
                return {
                    'success': False,
                    'error': f"Invalid input file: {validation['errors']}",
                    'output_path': None
                }

            # Read audio data (whole file in memory)
            data, sample_rate = sf.read(file_path)

            # Record each change at the moment it is applied, so the report
            # cannot drift from what was actually done to the samples.
            changes_made = []

            # Convert to mono if multi-channel
            if data.ndim > 1 and data.shape[1] > 1:
                data = np.mean(data, axis=1)
                changes_made.append('converted_to_mono')
                logger.info("🔄 Converted stereo to mono")

            # Normalize amplitude; guard against zero-frame input, for which
            # np.max would raise, and against silence (peak == 0).
            peak = np.max(np.abs(data)) if data.size else 0.0
            if peak > 0:
                data = data / peak * 0.95
                changes_made.append('normalized_amplitude')
                logger.info("🔄 Normalized amplitude")

            # Resampling is not implemented: only log the mismatch.
            if sample_rate != self.target_sample_rate:
                logger.info(f"🔄 Sample rate: {sample_rate} Hz (target: {self.target_sample_rate} Hz)")
                # Note: For production, implement proper resampling with librosa

            # Create output path if not provided
            if output_path is None:
                # Re-create the directory if it was never made or has been
                # removed behind our back since the last call.
                if self.temp_dir is None or not os.path.isdir(self.temp_dir):
                    self.temp_dir = tempfile.mkdtemp(prefix="gaia_audio_")

                output_path = os.path.join(
                    self.temp_dir,
                    f"normalized_{Path(file_path).stem}.wav"
                )

            # Write normalized audio at the original sample rate
            sf.write(output_path, data, sample_rate)

            # Validate output
            output_validation = self.validate_audio_file(output_path)

            result = {
                'success': True,
                'output_path': output_path,
                'original_info': validation['info'],
                'normalized_info': output_validation['info'] if output_validation['valid'] else {},
                'changes_made': changes_made
            }

            logger.info(f"✅ Audio normalization completed: {output_path}")
            return result

        except Exception as e:
            logger.error(f"❌ Audio normalization failed: {e}")
            return {
                'success': False,
                'error': f"Normalization failed: {str(e)}",
                'output_path': None
            }

    def extract_metadata(self, file_path: str) -> Dict[str, Any]:
        """
        Extract file-system and audio metadata plus a heuristic quality assessment.

        A failure to read the audio header does not fail the call: the
        ``audio_info`` and ``quality_assessment`` sections then carry an
        ``error`` entry while ``file_info`` is still populated.

        Args:
            file_path: Path to audio file

        Returns:
            Dictionary with ``success`` and ``metadata`` (sections
            ``file_info``, ``audio_info``, ``quality_assessment``); on failure
            also ``error`` and an empty ``metadata``.
        """
        try:
            if not self.available:
                return {
                    'success': False,
                    'error': 'Audio processing dependencies not available',
                    'metadata': {}
                }

            logger.info(f"📊 Extracting metadata from: {file_path}")

            # Basic file information
            path = Path(file_path)
            file_stats = path.stat()

            metadata = {
                'file_info': {
                    'name': path.name,
                    'size_bytes': file_stats.st_size,
                    'size_mb': file_stats.st_size / (1024 * 1024),
                    'extension': path.suffix.lower(),
                    'created': file_stats.st_ctime,
                    'modified': file_stats.st_mtime
                },
                'audio_info': {},
                'quality_assessment': {}
            }

            # Audio-specific information
            try:
                info = sf.info(file_path)

                metadata['audio_info'] = {
                    'duration_seconds': info.duration,
                    'duration_formatted': self._format_duration(info.duration),
                    'sample_rate': info.samplerate,
                    'channels': info.channels,
                    'frames': info.frames,
                    'format': info.format,
                    'subtype': info.subtype,
                    'bits_per_sample': self._get_bits_per_sample(info.subtype)
                }

                # Quality assessment
                quality_score = self._assess_audio_quality(info)
                metadata['quality_assessment'] = {
                    'overall_score': quality_score,
                    'sample_rate_quality': self._assess_sample_rate(info.samplerate),
                    'duration_quality': self._assess_duration(info.duration),
                    'format_quality': self._assess_format(info.format, info.subtype),
                    'recommendations': self._get_quality_recommendations(info)
                }

            except Exception as e:
                metadata['audio_info'] = {'error': f"Could not read audio info: {str(e)}"}
                metadata['quality_assessment'] = {'error': str(e)}

            logger.info("✅ Metadata extraction completed")
            return {
                'success': True,
                'metadata': metadata
            }

        except Exception as e:
            logger.error(f"❌ Metadata extraction failed: {e}")
            return {
                'success': False,
                'error': f"Metadata extraction failed: {str(e)}",
                'metadata': {}
            }

    def prepare_for_transcription(self, file_path: str) -> Dict[str, Any]:
        """
        Validate a file and normalize it when it is not already in target form.

        Normalization is triggered by more than one channel, a sample rate
        different from ``target_sample_rate``, or a peak level outside the
        accepted range (see ``_needs_amplitude_normalization``).

        Args:
            file_path: Path to input audio file

        Returns:
            Dictionary with ``success``, ``prepared_file`` (the original path
            when nothing had to be done, otherwise the normalized file) and
            ``original_file``; plus ``normalization_applied`` and details on
            success, or ``error`` on failure.
        """
        try:
            logger.info(f"🎯 Preparing audio for transcription: {file_path}")

            # Validate input
            validation = self.validate_audio_file(file_path)
            if not validation['valid']:
                return {
                    'success': False,
                    'error': f"Invalid audio file: {validation['errors']}",
                    'prepared_file': None,
                    'original_file': file_path
                }

            # Check if normalization is needed
            info = validation['info']
            needs_normalization = (
                info.get('channels', 1) > 1 or  # Stereo to mono
                info.get('sample_rate', 16000) != self.target_sample_rate or  # Rate mismatch
                self._needs_amplitude_normalization(file_path)  # Amplitude normalization
            )

            if not needs_normalization:
                logger.info("✅ Audio file already optimal for transcription")
                return {
                    'success': True,
                    'prepared_file': file_path,
                    'original_file': file_path,
                    'normalization_applied': False,
                    'info': info
                }

            # Apply normalization
            normalization_result = self.normalize_audio(file_path)

            if not normalization_result['success']:
                return {
                    'success': False,
                    'error': f"Normalization failed: {normalization_result['error']}",
                    'prepared_file': None,
                    'original_file': file_path
                }

            return {
                'success': True,
                'prepared_file': normalization_result['output_path'],
                'original_file': file_path,
                'normalization_applied': True,
                'changes_made': normalization_result['changes_made'],
                'original_info': normalization_result['original_info'],
                'normalized_info': normalization_result['normalized_info']
            }

        except Exception as e:
            logger.error(f"❌ Audio preparation failed: {e}")
            return {
                'success': False,
                'error': f"Preparation failed: {str(e)}",
                'prepared_file': None,
                'original_file': file_path
            }

    def cleanup_temp_files(self):
        """
        Remove the temporary directory created during processing, if any.

        ``temp_dir`` is reset to None both after a successful removal and when
        the directory is already gone, so the next normalization creates a
        fresh one. If removal fails, ``temp_dir`` is kept so cleanup can be
        retried; the failure is logged, not raised.
        """
        if not self.temp_dir:
            return

        try:
            if os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                logger.info("🧹 Temporary files cleaned up")
            # Reached on successful removal or when nothing was left to remove.
            self.temp_dir = None
        except Exception as e:
            logger.warning(f"⚠️ Failed to cleanup temp files: {e}")

    def _format_duration(self, duration_seconds: float) -> str:
        """Format a duration as ``MM:SS``, or ``HH:MM:SS`` from one hour up (seconds truncated)."""
        hours = int(duration_seconds // 3600)
        minutes = int((duration_seconds % 3600) // 60)
        seconds = int(duration_seconds % 60)

        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            return f"{minutes:02d}:{seconds:02d}"

    def _get_bits_per_sample(self, subtype: str) -> int:
        """
        Map a subtype name to its bits per sample.

        Unknown subtypes (for example compressed codecs) fall back to 16.
        """
        subtype_bits = {
            'PCM_S8': 8,
            'PCM_U8': 8,
            'ULAW': 8,
            'ALAW': 8,
            'PCM_16': 16,
            'PCM_24': 24,
            'PCM_32': 32,
            'FLOAT': 32,
            'DOUBLE': 64
        }
        return subtype_bits.get(subtype, 16)

    def _assess_audio_quality(self, info) -> float:
        """
        Score audio suitability for transcription, from 0 to 1.

        Starts at 1.0 and subtracts penalties for low sample rate, very short
        or very long duration, and more than one channel. ``info`` must expose
        ``samplerate``, ``duration`` and ``channels``.
        """
        score = 1.0

        # Sample rate assessment
        if info.samplerate < 8000:
            score -= 0.3
        elif info.samplerate < 16000:
            score -= 0.1

        # Duration assessment
        if info.duration < 1.0:
            score -= 0.2
        elif info.duration > 3600:
            score -= 0.1

        # Channel assessment (mono is better for speech)
        if info.channels > 1:
            score -= 0.1

        return max(0.0, score)

    def _assess_sample_rate(self, sample_rate: int) -> str:
        """Classify a sample rate in Hz into a quality label."""
        if sample_rate >= 44100:
            return "excellent"
        elif sample_rate >= 22050:
            return "good"
        elif sample_rate >= 16000:
            return "adequate"
        elif sample_rate >= 8000:
            return "poor"
        else:
            return "very_poor"

    def _assess_duration(self, duration: float) -> str:
        """Classify a duration in seconds into a quality label."""
        if 10 <= duration <= 1800:  # 10 seconds to 30 minutes
            return "optimal"
        elif 1 <= duration <= 3600:  # 1 second to 1 hour
            return "good"
        elif duration < 1:
            return "too_short"
        else:
            return "very_long"

    def _assess_format(self, format_name: str, subtype: str) -> str:
        """Classify a container format / subtype pair into a quality label."""
        if format_name == 'WAV' and 'PCM' in subtype:
            return "excellent"
        elif format_name == 'FLAC':
            return "excellent"
        elif format_name in ['WAV', 'AIFF']:
            return "good"
        elif format_name == 'MP3':
            return "adequate"
        else:
            return "unknown"

    def _get_quality_recommendations(self, info) -> List[str]:
        """
        Build human-readable suggestions for improving transcription input.

        ``info`` must expose ``samplerate``, ``channels`` and ``duration``.
        """
        recommendations = []

        if info.samplerate < 16000:
            recommendations.append("Consider using higher sample rate (16kHz+) for better transcription")

        if info.channels > 1:
            recommendations.append("Convert to mono for speech recognition")

        if info.duration < 1.0:
            recommendations.append("Audio is very short - ensure it contains speech")
        elif info.duration > 3600:
            recommendations.append("Consider splitting long audio into smaller segments")

        return recommendations

    def _needs_amplitude_normalization(self, file_path: str) -> bool:
        """
        Check whether the start of the file peaks outside the accepted range.

        Only the first 16000 frames are inspected (one second at 16 kHz, less
        at higher sample rates). Returns True when the peak is below 0.1 or
        above 0.98, and also when the sample cannot be read or is empty.
        """
        try:
            # Read a small leading sample to check amplitude
            data, _ = sf.read(file_path, frames=16000)
            max_amplitude = np.max(np.abs(data))

            # Needs normalization if too quiet or too loud
            return max_amplitude < 0.1 or max_amplitude > 0.98

        except Exception:
            return True  # Assume normalization needed if can't check


# Create handler instance
def create_audio_file_handler() -> Optional[AudioFileHandler]:
    """
    Build an AudioFileHandler, returning it only when it is usable.

    Returns:
        The new handler when its ``available`` flag is set; None when the flag
        is not set or when construction raises (the failure is logged).
    """
    try:
        instance = AudioFileHandler()

        # Guard clause: a handler without its dependencies is of no use to callers.
        if not instance.available:
            logger.warning("⚠️ Audio file handler not available")
            return None

        logger.info("✅ Audio file handler created successfully")
        return instance
    except Exception as exc:
        logger.error(f"❌ Failed to create audio file handler: {exc}")
        return None