""" | |
Audio File Handler for GAIA Agent | |
Provides comprehensive audio file processing capabilities including: | |
- Multi-format audio file processing and conversion | |
- Audio normalization and quality enhancement | |
- Metadata extraction and validation | |
- Streaming support for large files | |
""" | |
import os | |
import logging | |
import tempfile | |
import shutil | |
from typing import Dict, Any, Optional, List, Tuple, Union | |
from pathlib import Path | |
import json | |
try: | |
import soundfile as sf | |
import numpy as np | |
AUDIO_DEPS_AVAILABLE = True | |
except ImportError as e: | |
logging.warning(f"Audio dependencies not available: {e}") | |
AUDIO_DEPS_AVAILABLE = False | |
logger = logging.getLogger(__name__) | |


class AudioFileHandler:
    """
    Comprehensive audio file handler for GAIA evaluation tasks.

    Features:
    - Multi-format support (MP3, WAV, M4A, FLAC, OGG, AAC, WMA)
    - Audio conversion and normalization
    - Metadata extraction and validation
    - Quality assessment and enhancement
    - Streaming support for large files
    - Error handling and recovery
    """

    def __init__(self):
        """Initialize the audio file handler."""
        self.available = AUDIO_DEPS_AVAILABLE
        self.supported_formats = ['.mp3', '.wav', '.m4a', '.flac', '.ogg', '.aac', '.wma', '.webm']
        self.max_file_size = 100 * 1024 * 1024  # 100MB
        self.temp_dir = None

        # Audio processing parameters
        self.target_sample_rate = 16000  # Optimal for Whisper
        self.target_channels = 1  # Mono for speech recognition
        self.quality_threshold = 0.7  # Minimum quality score

        if not self.available:
            logger.warning("⚠️ Audio file handler not available - missing dependencies")
        else:
            logger.info("✅ Audio file handler initialized")

    def validate_audio_file(self, file_path: str) -> Dict[str, Any]:
        """
        Comprehensive audio file validation.

        Args:
            file_path: Path to the audio file

        Returns:
            Dictionary with validation results and file information
        """
        try:
            path = Path(file_path)
            validation_result = {
                'valid': False,
                'file_exists': False,
                'format_supported': False,
                'size_acceptable': False,
                'readable': False,
                'info': {},
                'errors': [],
                'warnings': []
            }

            # Check if file exists
            if not path.exists():
                validation_result['errors'].append(f"File not found: {file_path}")
                return validation_result
            validation_result['file_exists'] = True

            # Check file size
            file_size = path.stat().st_size
            if file_size == 0:
                validation_result['errors'].append("File is empty")
                return validation_result
            if file_size > self.max_file_size:
                validation_result['errors'].append(
                    f"File too large: {file_size / (1024*1024):.1f}MB (max: {self.max_file_size / (1024*1024)}MB)"
                )
                return validation_result
            validation_result['size_acceptable'] = True

            # Check file format
            file_ext = path.suffix.lower()
            if file_ext not in self.supported_formats:
                validation_result['errors'].append(
                    f"Unsupported format: {file_ext}. Supported: {', '.join(self.supported_formats)}"
                )
                return validation_result
            validation_result['format_supported'] = True

            # Try to read audio file and extract metadata
            try:
                if not self.available:
                    validation_result['errors'].append("Audio processing dependencies not available")
                    return validation_result

                info = sf.info(file_path)
                audio_info = {
                    'duration': info.duration,
                    'sample_rate': info.samplerate,
                    'channels': info.channels,
                    'frames': info.frames,
                    'format': info.format,
                    'subtype': info.subtype,
                    'file_size_mb': file_size / (1024 * 1024)
                }
                validation_result['info'] = audio_info
                validation_result['readable'] = True

                # Quality checks
                if info.duration < 0.1:
                    validation_result['warnings'].append("Very short audio duration")
                elif info.duration > 3600:  # 1 hour
                    validation_result['warnings'].append("Very long audio file - processing may take time")
                if info.samplerate < 8000:
                    validation_result['warnings'].append("Low sample rate - may affect transcription quality")

                validation_result['valid'] = True
            except Exception as e:
                validation_result['errors'].append(f"Cannot read audio file: {str(e)}")
                return validation_result

            logger.info(f"✅ Audio file validation successful: {file_path}")
            return validation_result
        except Exception as e:
            logger.error(f"❌ Audio file validation failed: {e}")
            return {
                'valid': False,
                'errors': [f"Validation error: {str(e)}"],
                'file_exists': False,
                'format_supported': False,
                'size_acceptable': False,
                'readable': False,
                'info': {},
                'warnings': []
            }

    def normalize_audio(self, file_path: str, output_path: Optional[str] = None) -> Dict[str, Any]:
        """
        Normalize audio file for optimal speech recognition.

        Args:
            file_path: Path to input audio file
            output_path: Path for normalized output (optional, creates temp file if None)

        Returns:
            Dictionary with normalization results
        """
        try:
            if not self.available:
                return {
                    'success': False,
                    'error': 'Audio processing dependencies not available',
                    'output_path': None
                }

            logger.info(f"🔧 Normalizing audio file: {file_path}")

            # Validate input file
            validation = self.validate_audio_file(file_path)
            if not validation['valid']:
                return {
                    'success': False,
                    'error': f"Invalid input file: {validation['errors']}",
                    'output_path': None
                }
            # Read audio data
            data, sample_rate = sf.read(file_path)

            # Convert to mono if stereo, tracking whether a conversion happened
            converted_to_mono = False
            if data.ndim > 1 and data.shape[1] > 1:
                data = np.mean(data, axis=1)
                converted_to_mono = True
                logger.info("🔄 Converted stereo to mono")

            # Normalize amplitude
            if np.max(np.abs(data)) > 0:
                data = data / np.max(np.abs(data)) * 0.95
                logger.info("🔄 Normalized amplitude")

            # Resample if necessary (resampling itself is not implemented here)
            if sample_rate != self.target_sample_rate:
                logger.info(f"🔄 Sample rate: {sample_rate} Hz (target: {self.target_sample_rate} Hz)")
                # Note: For production, implement proper resampling with librosa
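                # A minimal sketch of how resampling could be added at this point,
                # assuming librosa were added to the dependencies (it is not part
                # of the current import set):
                #
                #     import librosa
                #     data = librosa.resample(
                #         data, orig_sr=sample_rate, target_sr=self.target_sample_rate
                #     )
                #     sample_rate = self.target_sample_rate
                #
                # librosa.resample performs band-limited interpolation, which avoids
                # the aliasing a naive nearest-neighbour resampler would introduce.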
            # Create output path if not provided
            if output_path is None:
                if self.temp_dir is None:
                    self.temp_dir = tempfile.mkdtemp(prefix="gaia_audio_")
                output_path = os.path.join(
                    self.temp_dir,
                    f"normalized_{Path(file_path).stem}.wav"
                )

            # Write normalized audio
            sf.write(output_path, data, sample_rate)

            # Validate output
            output_validation = self.validate_audio_file(output_path)

            result = {
                'success': True,
                'output_path': output_path,
                'original_info': validation['info'],
                'normalized_info': output_validation['info'] if output_validation['valid'] else {},
                'changes_made': []
            }
            # Document the changes that were actually applied
            if converted_to_mono:
                result['changes_made'].append('converted_to_mono')
            result['changes_made'].append('normalized_amplitude')
            if sample_rate != self.target_sample_rate:
                # Resampling is not performed above, so flag it rather than claim it
                result['changes_made'].append('resampling_needed')

            logger.info(f"✅ Audio normalization completed: {output_path}")
            return result
        except Exception as e:
            logger.error(f"❌ Audio normalization failed: {e}")
            return {
                'success': False,
                'error': f"Normalization failed: {str(e)}",
                'output_path': None
            }

    def extract_metadata(self, file_path: str) -> Dict[str, Any]:
        """
        Extract comprehensive metadata from audio file.

        Args:
            file_path: Path to audio file

        Returns:
            Dictionary with extracted metadata
        """
        try:
            if not self.available:
                return {
                    'success': False,
                    'error': 'Audio processing dependencies not available',
                    'metadata': {}
                }

            logger.info(f"📊 Extracting metadata from: {file_path}")

            # Basic file information
            path = Path(file_path)
            file_stats = path.stat()
            metadata = {
                'file_info': {
                    'name': path.name,
                    'size_bytes': file_stats.st_size,
                    'size_mb': file_stats.st_size / (1024 * 1024),
                    'extension': path.suffix.lower(),
                    'created': file_stats.st_ctime,
                    'modified': file_stats.st_mtime
                },
                'audio_info': {},
                'quality_assessment': {}
            }

            # Audio-specific information
            try:
                info = sf.info(file_path)
                metadata['audio_info'] = {
                    'duration_seconds': info.duration,
                    'duration_formatted': self._format_duration(info.duration),
                    'sample_rate': info.samplerate,
                    'channels': info.channels,
                    'frames': info.frames,
                    'format': info.format,
                    'subtype': info.subtype,
                    'bits_per_sample': self._get_bits_per_sample(info.subtype)
                }

                # Quality assessment
                quality_score = self._assess_audio_quality(info)
                metadata['quality_assessment'] = {
                    'overall_score': quality_score,
                    'sample_rate_quality': self._assess_sample_rate(info.samplerate),
                    'duration_quality': self._assess_duration(info.duration),
                    'format_quality': self._assess_format(info.format, info.subtype),
                    'recommendations': self._get_quality_recommendations(info)
                }
            except Exception as e:
                metadata['audio_info'] = {'error': f"Could not read audio info: {str(e)}"}
                metadata['quality_assessment'] = {'error': str(e)}

            logger.info("✅ Metadata extraction completed")
            return {
                'success': True,
                'metadata': metadata
            }
        except Exception as e:
            logger.error(f"❌ Metadata extraction failed: {e}")
            return {
                'success': False,
                'error': f"Metadata extraction failed: {str(e)}",
                'metadata': {}
            }

    def prepare_for_transcription(self, file_path: str) -> Dict[str, Any]:
        """
        Prepare audio file for optimal transcription quality.

        Args:
            file_path: Path to input audio file

        Returns:
            Dictionary with preparation results and optimized file path
        """
        try:
            logger.info(f"🎯 Preparing audio for transcription: {file_path}")

            # Validate input
            validation = self.validate_audio_file(file_path)
            if not validation['valid']:
                return {
                    'success': False,
                    'error': f"Invalid audio file: {validation['errors']}",
                    'prepared_file': None,
                    'original_file': file_path
                }

            # Check if normalization is needed
            info = validation['info']
            needs_normalization = (
                info.get('channels', 1) > 1 or  # Stereo to mono
                info.get('sample_rate', 16000) != self.target_sample_rate or  # Resample
                self._needs_amplitude_normalization(file_path)  # Amplitude normalization
            )

            if not needs_normalization:
                logger.info("✅ Audio file already optimal for transcription")
                return {
                    'success': True,
                    'prepared_file': file_path,
                    'original_file': file_path,
                    'normalization_applied': False,
                    'info': info
                }

            # Apply normalization
            normalization_result = self.normalize_audio(file_path)
            if not normalization_result['success']:
                return {
                    'success': False,
                    'error': f"Normalization failed: {normalization_result['error']}",
                    'prepared_file': None,
                    'original_file': file_path
                }

            return {
                'success': True,
                'prepared_file': normalization_result['output_path'],
                'original_file': file_path,
                'normalization_applied': True,
                'changes_made': normalization_result['changes_made'],
                'original_info': normalization_result['original_info'],
                'normalized_info': normalization_result['normalized_info']
            }
        except Exception as e:
            logger.error(f"❌ Audio preparation failed: {e}")
            return {
                'success': False,
                'error': f"Preparation failed: {str(e)}",
                'prepared_file': None,
                'original_file': file_path
            }

    def cleanup_temp_files(self):
        """Clean up temporary files created during processing."""
        try:
            if self.temp_dir and os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir)
                self.temp_dir = None
                logger.info("🧹 Temporary files cleaned up")
        except Exception as e:
            logger.warning(f"⚠️ Failed to cleanup temp files: {e}")

    def _format_duration(self, duration_seconds: float) -> str:
        """Format duration in human-readable format."""
        hours = int(duration_seconds // 3600)
        minutes = int((duration_seconds % 3600) // 60)
        seconds = int(duration_seconds % 60)
        if hours > 0:
            return f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            return f"{minutes:02d}:{seconds:02d}"

    def _get_bits_per_sample(self, subtype: str) -> int:
        """Get bits per sample from subtype."""
        subtype_bits = {
            'PCM_16': 16,
            'PCM_24': 24,
            'PCM_32': 32,
            'FLOAT': 32,
            'DOUBLE': 64
        }
        return subtype_bits.get(subtype, 16)

    def _assess_audio_quality(self, info) -> float:
        """Assess overall audio quality for transcription (0-1 score)."""
        score = 1.0

        # Sample rate assessment
        if info.samplerate < 8000:
            score -= 0.3
        elif info.samplerate < 16000:
            score -= 0.1

        # Duration assessment
        if info.duration < 1.0:
            score -= 0.2
        elif info.duration > 3600:
            score -= 0.1

        # Channel assessment (mono is better for speech)
        if info.channels > 1:
            score -= 0.1

        return max(0.0, score)

    def _assess_sample_rate(self, sample_rate: int) -> str:
        """Assess sample rate quality."""
        if sample_rate >= 44100:
            return "excellent"
        elif sample_rate >= 22050:
            return "good"
        elif sample_rate >= 16000:
            return "adequate"
        elif sample_rate >= 8000:
            return "poor"
        else:
            return "very_poor"

    def _assess_duration(self, duration: float) -> str:
        """Assess duration quality."""
        if 10 <= duration <= 1800:  # 10 seconds to 30 minutes
            return "optimal"
        elif 1 <= duration <= 3600:  # 1 second to 1 hour
            return "good"
        elif duration < 1:
            return "too_short"
        else:
            return "very_long"

    def _assess_format(self, format_name: str, subtype: str) -> str:
        """Assess format quality."""
        if format_name == 'WAV' and 'PCM' in subtype:
            return "excellent"
        elif format_name == 'FLAC':
            return "excellent"
        elif format_name in ['WAV', 'AIFF']:
            return "good"
        elif format_name == 'MP3':
            return "adequate"
        else:
            return "unknown"

    def _get_quality_recommendations(self, info) -> List[str]:
        """Get recommendations for improving audio quality."""
        recommendations = []
        if info.samplerate < 16000:
            recommendations.append("Consider using higher sample rate (16kHz+) for better transcription")
        if info.channels > 1:
            recommendations.append("Convert to mono for speech recognition")
        if info.duration < 1.0:
            recommendations.append("Audio is very short - ensure it contains speech")
        elif info.duration > 3600:
            recommendations.append("Consider splitting long audio into smaller segments")
        return recommendations

    def _needs_amplitude_normalization(self, file_path: str) -> bool:
        """Check if audio needs amplitude normalization."""
        try:
            # Read a small sample to check amplitude (16000 frames ≈ 1 second at 16 kHz)
            data, _ = sf.read(file_path, frames=16000)
            max_amplitude = np.max(np.abs(data))
            # Needs normalization if too quiet or too loud
            return max_amplitude < 0.1 or max_amplitude > 0.98
        except Exception:
            return True  # Assume normalization is needed if the file cannot be checked


# Create handler instance
def create_audio_file_handler() -> Optional[AudioFileHandler]:
    """Create and return an audio file handler instance."""
    try:
        handler = AudioFileHandler()
        if handler.available:
            logger.info("✅ Audio file handler created successfully")
            return handler
        else:
            logger.warning("⚠️ Audio file handler not available")
            return None
    except Exception as e:
        logger.error(f"❌ Failed to create audio file handler: {e}")
        return None
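

# A minimal usage sketch for local testing, not part of the handler's API.
# The path below is hypothetical; substitute a real audio file when running
# this module directly.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    handler = create_audio_file_handler()
    if handler is not None:
        sample_path = "sample_audio.wav"  # hypothetical file, not shipped with this module

        # Validate the file and print the structured result
        validation = handler.validate_audio_file(sample_path)
        print(json.dumps(validation, indent=2, default=str))

        if validation['valid']:
            # Inspect metadata and prepare the file for transcription
            metadata = handler.extract_metadata(sample_path)
            print(json.dumps(metadata.get('metadata', {}).get('audio_info', {}), indent=2, default=str))

            prepared = handler.prepare_for_transcription(sample_path)
            print(f"Prepared file: {prepared.get('prepared_file')}")

        # Remove any temporary files created during normalization
        handler.cleanup_temp_files()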