dtrovato997 committed
Commit 5277669 · 1 Parent(s): ead33a6

fix: clip audio to max 2 mins

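At its core, the change is a sample-count cut: keep only the first MAX_DURATION_SECONDS * sr samples and report whether anything was dropped. A minimal, self-contained sketch of that rule (the 150-second buffer is a hypothetical input, not from the repo):

import numpy as np

MAX_DURATION_SECONDS = 120  # 2-minute cap, as introduced in this commit
sr = 16000                  # service sampling rate
audio_data = np.zeros(sr * 150, dtype=np.float32)  # hypothetical 150 s of audio

max_samples = int(MAX_DURATION_SECONDS * sr)
was_clipped = len(audio_data) > max_samples
audio_data = audio_data[:max_samples]

print(was_clipped, len(audio_data) / sr)  # True 120.0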
main.py CHANGED
@@ -20,6 +20,7 @@ logger = logging.getLogger(__name__)
 UPLOAD_FOLDER = 'uploads'
 ALLOWED_EXTENSIONS = {'wav', 'mp3', 'flac', 'm4a'}
 SAMPLING_RATE = 16000
+MAX_DURATION_SECONDS = 120  # 2 minutes maximum
 
 os.makedirs(UPLOAD_FOLDER, exist_ok=True)
 
@@ -31,6 +32,23 @@ def allowed_file(filename: str) -> bool:
     return '.' in filename and \
            filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
 
+def clip_audio_to_max_duration(audio_data: np.ndarray, sr: int, max_duration: int = MAX_DURATION_SECONDS) -> tuple[np.ndarray, bool]:
+    current_duration = len(audio_data) / sr
+
+    if current_duration <= max_duration:
+        logger.info(f"Audio duration ({current_duration:.2f}s) is within limit ({max_duration}s) - no clipping needed")
+        return audio_data, False
+
+    # Calculate how many samples we need for the max duration
+    max_samples = int(max_duration * sr)
+
+    # Clip to first max_duration seconds
+    clipped_audio = audio_data[:max_samples]
+
+    logger.info(f"Audio clipped from {current_duration:.2f}s to {max_duration}s ({len(audio_data)} samples → {len(clipped_audio)} samples)")
+
+    return clipped_audio, True
+
 async def load_models() -> bool:
     global age_gender_model, nationality_model
 
@@ -96,10 +114,11 @@ app = FastAPI(
     lifespan=lifespan
 )
 
-def preprocess_audio(audio_data: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
+def preprocess_audio(audio_data: np.ndarray, sr: int) -> tuple[np.ndarray, int, bool]:
     preprocess_start = time.time()
     original_shape = audio_data.shape
-    logger.info(f"Starting audio preprocessing - Original shape: {original_shape}, Sample rate: {sr}Hz")
+    original_duration = len(audio_data) / sr
+    logger.info(f"Starting audio preprocessing - Sample rate: {sr}Hz, Duration: {original_duration:.2f}s")
 
     # Convert to mono if stereo
     if len(audio_data.shape) > 1:
@@ -115,20 +134,24 @@ def preprocess_audio(audio_data: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
         audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=SAMPLING_RATE)
         resample_end = time.time()
         logger.info(f"Resampling completed in {resample_end - resample_start:.3f} seconds")
+        sr = SAMPLING_RATE
     else:
         logger.info(f"No resampling needed - already at {SAMPLING_RATE}Hz")
 
+    # Clip audio to maximum duration if needed
+    audio_data, was_clipped = clip_audio_to_max_duration(audio_data, sr)
+
     # Convert to float32
     audio_data = audio_data.astype(np.float32)
 
     preprocess_end = time.time()
-    duration_seconds = len(audio_data) / SAMPLING_RATE
+    final_duration_seconds = len(audio_data) / sr
     logger.info(f"Audio preprocessing completed in {preprocess_end - preprocess_start:.3f} seconds")
-    logger.info(f"Final audio: {audio_data.shape} samples, {duration_seconds:.2f} seconds duration")
+    logger.info(f"Final audio: {audio_data.shape} samples, {final_duration_seconds:.2f} seconds duration")
 
-    return audio_data, SAMPLING_RATE
+    return audio_data, sr, was_clipped
 
-async def process_audio_file(file: UploadFile) -> tuple[np.ndarray, int]:
+async def process_audio_file(file: UploadFile) -> tuple[np.ndarray, int, bool]:
     process_start = time.time()
     logger.info(f"Processing uploaded file: {file.filename}")
 
@@ -165,12 +188,12 @@ async def process_audio_file(file: UploadFile) -> tuple[np.ndarray, int]:
         load_end = time.time()
         logger.info(f"Audio loaded in {load_end - load_start:.3f} seconds")
 
-        processed_audio, processed_sr = preprocess_audio(audio_data, sr)
+        processed_audio, processed_sr, was_clipped = preprocess_audio(audio_data, sr)
 
         process_end = time.time()
         logger.info(f"Total file processing completed in {process_end - process_start:.3f} seconds")
 
-        return processed_audio, processed_sr
+        return processed_audio, processed_sr, was_clipped
 
     except Exception as e:
         logger.error(f"Error processing audio file {file.filename}: {str(e)}")
@@ -186,6 +209,7 @@ async def root() -> Dict[str, Any]:
     logger.info("Root endpoint accessed")
     return {
         "message": "Audio Analysis API - Age, Gender & Nationality Prediction",
+        "max_audio_duration": f"{MAX_DURATION_SECONDS} seconds (files longer than this will be automatically clipped)",
         "models_loaded": {
             "age_gender": age_gender_model is not None and hasattr(age_gender_model, 'model') and age_gender_model.model is not None,
             "nationality": nationality_model is not None and hasattr(nationality_model, 'model') and nationality_model.model is not None
@@ -206,7 +230,6 @@ async def health_check() -> Dict[str, str]:
 
 @app.post("/predict_age_and_gender")
 async def predict_age_and_gender(file: UploadFile = File(...)) -> Dict[str, Any]:
-    """Predict age and gender from uploaded audio file."""
     endpoint_start = time.time()
     logger.info(f"Age & Gender prediction requested for file: {file.filename}")
 
@@ -215,7 +238,7 @@ async def predict_age_and_gender(file: UploadFile = File(...)) -> Dict[str, Any]
         raise HTTPException(status_code=500, detail="Age & gender model not loaded")
 
     try:
-        processed_audio, processed_sr = await process_audio_file(file)
+        processed_audio, processed_sr, was_clipped = await process_audio_file(file)
 
         # Make prediction
         prediction_start = time.time()
@@ -230,12 +253,21 @@ async def predict_age_and_gender(file: UploadFile = File(...)) -> Dict[str, Any]
         endpoint_end = time.time()
         logger.info(f"Total age & gender endpoint processing time: {endpoint_end - endpoint_start:.3f} seconds")
 
-        return {
+        response = {
             "success": True,
             "predictions": predictions,
-            "processing_time": round(endpoint_end - endpoint_start, 3)
+            "processing_time": round(endpoint_end - endpoint_start, 3),
+            "audio_info": {
+                "was_clipped": was_clipped,
+                "max_duration_seconds": MAX_DURATION_SECONDS
+            }
         }
 
+        if was_clipped:
+            response["warning"] = f"Audio was longer than {MAX_DURATION_SECONDS} seconds and was automatically clipped to the first {MAX_DURATION_SECONDS} seconds for analysis."
+
+        return response
+
     except HTTPException:
         raise
     except Exception as e:
@@ -244,7 +276,6 @@ async def predict_age_and_gender(file: UploadFile = File(...)) -> Dict[str, Any]
 
 @app.post("/predict_nationality")
 async def predict_nationality(file: UploadFile = File(...)) -> Dict[str, Any]:
-    """Predict nationality/language from uploaded audio file."""
     endpoint_start = time.time()
     logger.info(f"Nationality prediction requested for file: {file.filename}")
 
@@ -253,7 +284,7 @@ async def predict_nationality(file: UploadFile = File(...)) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail="Nationality model not loaded")
 
     try:
-        processed_audio, processed_sr = await process_audio_file(file)
+        processed_audio, processed_sr, was_clipped = await process_audio_file(file)
 
         # Make prediction
         prediction_start = time.time()
@@ -268,12 +299,21 @@ async def predict_nationality(file: UploadFile = File(...)) -> Dict[str, Any]:
         endpoint_end = time.time()
         logger.info(f"Total nationality endpoint processing time: {endpoint_end - endpoint_start:.3f} seconds")
 
-        return {
+        response = {
             "success": True,
             "predictions": predictions,
-            "processing_time": round(endpoint_end - endpoint_start, 3)
+            "processing_time": round(endpoint_end - endpoint_start, 3),
+            "audio_info": {
+                "was_clipped": was_clipped,
+                "max_duration_seconds": MAX_DURATION_SECONDS
+            }
         }
 
+        if was_clipped:
+            response["warning"] = f"Audio was longer than {MAX_DURATION_SECONDS} seconds and was automatically clipped to the first {MAX_DURATION_SECONDS} seconds for analysis."
+
+        return response
+
     except HTTPException:
         raise
     except Exception as e:
@@ -282,7 +322,6 @@ async def predict_nationality(file: UploadFile = File(...)) -> Dict[str, Any]:
 
 @app.post("/predict_all")
 async def predict_all(file: UploadFile = File(...)) -> Dict[str, Any]:
-    """Predict age, gender, and nationality from uploaded audio file."""
     endpoint_start = time.time()
     logger.info(f"Complete analysis requested for file: {file.filename}")
 
@@ -295,7 +334,7 @@ async def predict_all(file: UploadFile = File(...)) -> Dict[str, Any]:
         raise HTTPException(status_code=500, detail="Nationality model not loaded")
 
     try:
-        processed_audio, processed_sr = await process_audio_file(file)
+        processed_audio, processed_sr, was_clipped = await process_audio_file(file)
 
         # Get age & gender predictions
         age_prediction_start = time.time()
@@ -323,7 +362,7 @@ async def predict_all(file: UploadFile = File(...)) -> Dict[str, Any]:
         logger.info(f"Total prediction time: {total_prediction_time:.3f} seconds")
         logger.info(f"Total complete analysis endpoint processing time: {endpoint_end - endpoint_start:.3f} seconds")
 
-        return {
+        response = {
             "success": True,
             "predictions": {
                 "demographics": age_gender_predictions,
@@ -333,9 +372,18 @@ async def predict_all(file: UploadFile = File(...)) -> Dict[str, Any]:
                 "total": round(endpoint_end - endpoint_start, 3),
                 "age_gender": round(age_prediction_end - age_prediction_start, 3),
                 "nationality": round(nationality_prediction_end - nationality_prediction_start, 3)
+            },
+            "audio_info": {
+                "was_clipped": was_clipped,
+                "max_duration_seconds": MAX_DURATION_SECONDS
            }
         }
 
+        if was_clipped:
+            response["warning"] = f"Audio was longer than {MAX_DURATION_SECONDS} seconds and was automatically clipped to the first {MAX_DURATION_SECONDS} seconds for analysis."
+
+        return response
+
     except HTTPException:
         raise
     except Exception as e:
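With these changes, a successful prediction response for an over-length upload would look roughly like the dict below. This is a sketch: the predictions payload and timing values are illustrative stand-ins, not actual model output.

response = {
    "success": True,
    "predictions": {"age": 34, "gender": "female"},  # hypothetical model output
    "processing_time": 1.234,
    "audio_info": {
        "was_clipped": True,
        "max_duration_seconds": 120
    },
    "warning": "Audio was longer than 120 seconds and was automatically clipped to the first 120 seconds for analysis."
}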
models/age_and_gender_model.py CHANGED
@@ -7,7 +7,6 @@ import librosa
 
 class AgeGenderModel:
     def __init__(self, model_path=None):
-        # Use persistent storage if available, fallback to local cache
         if model_path is None:
             if os.path.exists("/data"):
                 # HF Spaces persistent storage
@@ -34,7 +33,7 @@ class AgeGenderModel:
         print("Age & gender model files not found. Downloading...")
 
         try:
-            # Use /data for cache if available, otherwise use local cache
+            # Use /data for cache if available, otherwise use local cache; this is in line with HF Spaces persistent storage
             if os.path.exists("/data"):
                 cache_root = '/data/cache'
             else:
@@ -72,16 +71,13 @@ class AgeGenderModel:
 
     def load(self):
         try:
-            # Download model if needed
             if not self.download_model():
                 print("Failed to download age & gender model")
                 return False
 
-            # Load the audonnx model
             print(f"Loading age & gender model from {self.model_path}...")
             self.model = audonnx.load(self.model_path)
 
-            # Create the audinterface Feature interface
             outputs = ['logits_age', 'logits_gender']
             self.interface = audinterface.Feature(
                 self.model.labels(outputs),
@@ -91,7 +87,7 @@ class AgeGenderModel:
                     'concat': True,
                 },
                 sampling_rate=self.sampling_rate,
-                resample=False,  # We handle resampling manually
+                resample=False,
                 verbose=False,
             )
             print("Age & gender model loaded successfully!")
@@ -105,7 +101,7 @@ class AgeGenderModel:
         if self.model is None or self.interface is None:
             raise ValueError("Model not loaded. Call load() first.")
 
-        try:  # Process with the interface
+        try:
             result = self.interface.process_signal(audio_data, sr)
 
             # Extract and process results
models/nationality_model.py CHANGED
@@ -9,7 +9,6 @@ SAMPLING_RATE = 16000
 
 class NationalityModel:
     def __init__(self, cache_dir=None):
-        # Use persistent storage if available, fallback to local cache
         if cache_dir is None:
             if os.path.exists("/data"):
                 # HF Spaces persistent storage
@@ -48,16 +47,13 @@ class NationalityModel:
             raise ValueError("Model not loaded. Call load() first.")
 
         try:
-            # Ensure audio is properly formatted (float32, mono)
             if len(audio_data.shape) > 1:
                 audio_data = audio_data.mean(axis=0)
 
             audio_data = audio_data.astype(np.float32)
 
-            # Process audio with the feature extractor
             inputs = self.processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
 
-            # Get model predictions
             with torch.no_grad():
                 outputs = self.model(**inputs).logits
 
@@ -65,7 +61,6 @@ class NationalityModel:
             probabilities = torch.nn.functional.softmax(outputs, dim=-1)[0]
             top_k_values, top_k_indices = torch.topk(probabilities, k=5)
 
-            # Convert to language codes and probabilities
             top_languages = []
             for i, idx in enumerate(top_k_indices):
                 lang_id = idx.item()
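For reference, the top-5 extraction in the context lines above reduces to a softmax followed by torch.topk. A runnable sketch (the 107-class logits tensor is a hypothetical stand-in for the real language-ID model output):

import torch

logits = torch.randn(1, 107)  # hypothetical logits for 107 language classes
probabilities = torch.nn.functional.softmax(logits, dim=-1)[0]
top_k_values, top_k_indices = torch.topk(probabilities, k=5)
for value, idx in zip(top_k_values, top_k_indices):
    print(f"language id {idx.item()}: probability {value.item():.3f}")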