dtrovato997 committed
Commit d7e7912 · Parent(s): 7c86e3c

Initial commit
.gitignore ADDED
@@ -0,0 +1,97 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ pip-wheel-metadata/
+ share/python-wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # Model cache directories (these will be large!)
+ cache/
+ models/cache/
+ *.pt
+ *.pth
+ *.bin
+ *.onnx
+ *.h5
+ *.pkl
+ *.joblib
+
+ # Audio uploads (temporary files)
+ uploads/
+ *.wav
+ *.mp3
+ *.flac
+ *.m4a
+ *.ogg
+
+ # IDE/Editor files
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+ .DS_Store
+ Thumbs.db
+
+ # Logs
+ *.log
+ logs/
+
+ # Environment variables
+ .env.local
+ .env.development.local
+ .env.test.local
+ .env.production.local
+
+ # Jupyter Notebook
+ .ipynb_checkpoints
+
+ # Flask specific
+ instance/
+ .webassets-cache
+
+ # Coverage reports
+ htmlcov/
+ .tox/
+ .coverage
+ .coverage.*
+ .cache
+ nosetests.xml
+ coverage.xml
+ *.cover
+ *.py,cover
+ .hypothesis/
+ .pytest_cache/
+
+ # Temporary files
+ *.tmp
+ *.temp
+ .tmp/
+ .temp/
Dockerfile ADDED
@@ -0,0 +1,29 @@
+ FROM python:3.9-slim
+
+ # Install system dependencies for audio processing
+ RUN apt-get update && apt-get install -y \
+     ffmpeg \
+     libsndfile1 \
+     gcc \
+     g++ \
+     curl \
+     && rm -rf /var/lib/apt/lists/*
+
+ # Set working directory
+ WORKDIR /app
+
+ # Copy requirements and install Python dependencies
+ COPY requirements.txt .
+ RUN pip install --no-cache-dir -r requirements.txt
+
+ # Copy application code
+ COPY . .
+
+ # Create directories
+ RUN mkdir -p uploads cache
+
+ # Expose port 7860 (HF Spaces default)
+ EXPOSE 7860
+
+ # Start the FastAPI app
+ CMD ["python", "app.py"]
app.py ADDED
@@ -0,0 +1,220 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from fastapi.responses import JSONResponse
+ import os
+ import numpy as np
+ import librosa
+ from typing import Dict, Any
+ import logging
+ from contextlib import asynccontextmanager
+ from models.nationality_model import NationalityModel
+ from models.age_and_gender_model import AgeGenderModel
+
+ # Configure logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ UPLOAD_FOLDER = 'uploads'
+ ALLOWED_EXTENSIONS = {'wav', 'mp3', 'flac', 'm4a'}
+ SAMPLING_RATE = 16000
+
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
+
+ # Global model variables
+ age_gender_model = None
+ nationality_model = None
+
+ def allowed_file(filename: str) -> bool:
+     return '.' in filename and \
+         filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS
+
+ async def load_models() -> bool:
+     global age_gender_model, nationality_model
+
+     try:
+         # Load age & gender model
+         logger.info("Loading age & gender model...")
+         age_gender_model = AgeGenderModel()
+         age_gender_success = age_gender_model.load()
+
+         if not age_gender_success:
+             logger.error("Failed to load age & gender model")
+             return False
+
+         # Load nationality model
+         logger.info("Loading nationality model...")
+         nationality_model = NationalityModel()
+         nationality_success = nationality_model.load()
+
+         if not nationality_success:
+             logger.error("Failed to load nationality model")
+             return False
+
+         logger.info("All models loaded successfully!")
+         return True
+     except Exception as e:
+         logger.error(f"Error loading models: {e}")
+         return False
+
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     # Startup
+     logger.info("Starting FastAPI application...")
+     success = await load_models()
+     if not success:
+         logger.error("Failed to load models. Application will not work properly.")
+
+     yield
+
+     # Shutdown
+     logger.info("Shutting down FastAPI application...")
+
+ # Create FastAPI app with lifespan events
+ app = FastAPI(
+     title="Audio Analysis API",
+     description="Audio analysis for age, gender, and nationality prediction",
+     version="1.0.0",
+     lifespan=lifespan
+ )
+
+ def preprocess_audio(audio_data: np.ndarray, sr: int) -> tuple[np.ndarray, int]:
+     if len(audio_data.shape) > 1:
+         audio_data = librosa.to_mono(audio_data)
+
+     if sr != SAMPLING_RATE:
+         logger.info(f"Resampling from {sr}Hz to {SAMPLING_RATE}Hz")
+         audio_data = librosa.resample(audio_data, orig_sr=sr, target_sr=SAMPLING_RATE)
+
+     audio_data = audio_data.astype(np.float32)
+
+     return audio_data, SAMPLING_RATE
+
+ async def process_audio_file(file: UploadFile) -> tuple[np.ndarray, int]:
+     if not file.filename:
+         raise HTTPException(status_code=400, detail="No file selected")
+
+     if not allowed_file(file.filename):
+         raise HTTPException(status_code=400, detail="Invalid file type. Allowed: wav, mp3, flac, m4a")
+
+     # Create a secure filename
+     filename = f"temp_{file.filename}"
+     filepath = os.path.join(UPLOAD_FOLDER, filename)
+
+     try:
+         # Save uploaded file temporarily
+         with open(filepath, "wb") as buffer:
+             content = await file.read()
+             buffer.write(content)
+
+         # Load and preprocess audio
+         audio_data, sr = librosa.load(filepath, sr=None)
+         processed_audio, processed_sr = preprocess_audio(audio_data, sr)
+
+         return processed_audio, processed_sr
+
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Error processing audio file: {str(e)}")
+     finally:
+         # Clean up temporary file
+         if os.path.exists(filepath):
+             os.remove(filepath)
+
+ @app.get("/")
+ async def root() -> Dict[str, Any]:
+     return {
+         "message": "Audio Analysis API - Age, Gender & Nationality Prediction",
+         "models_loaded": {
+             "age_gender": age_gender_model is not None and hasattr(age_gender_model, 'model') and age_gender_model.model is not None,
+             "nationality": nationality_model is not None and hasattr(nationality_model, 'model') and nationality_model.model is not None
+         },
+         "endpoints": {
+             "/predict_age_and_gender": "POST - Upload audio file for age and gender prediction",
+             "/predict_nationality": "POST - Upload audio file for nationality prediction",
+             "/predict_all": "POST - Upload audio file for complete analysis (age, gender, nationality)",
+         },
+         "docs": "/docs - Interactive API documentation",
+         "openapi": "/openapi.json - OpenAPI schema"
+     }
+
+ @app.get("/health")
+ async def health_check() -> Dict[str, str]:
+     return {"status": "healthy"}
+
+ @app.post("/predict_age_and_gender")
+ async def predict_age_and_gender(file: UploadFile = File(...)) -> Dict[str, Any]:
+     """Predict age and gender from uploaded audio file."""
+     if age_gender_model is None or not hasattr(age_gender_model, 'model') or age_gender_model.model is None:
+         raise HTTPException(status_code=500, detail="Age & gender model not loaded")
+
+     try:
+         processed_audio, processed_sr = await process_audio_file(file)
+         predictions = age_gender_model.predict(processed_audio, processed_sr)
+
+         return {
+             "success": True,
+             "predictions": predictions
+         }
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/predict_nationality")
+ async def predict_nationality(file: UploadFile = File(...)) -> Dict[str, Any]:
+     """Predict nationality/language from uploaded audio file."""
+     if nationality_model is None or not hasattr(nationality_model, 'model') or nationality_model.model is None:
+         raise HTTPException(status_code=500, detail="Nationality model not loaded")
+
+     try:
+         processed_audio, processed_sr = await process_audio_file(file)
+         predictions = nationality_model.predict(processed_audio, processed_sr)
+
+         return {
+             "success": True,
+             "predictions": predictions
+         }
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/predict_all")
+ async def predict_all(file: UploadFile = File(...)) -> Dict[str, Any]:
+     """Run age, gender, and nationality prediction on a single uploaded audio file."""
+     if age_gender_model is None or not hasattr(age_gender_model, 'model') or age_gender_model.model is None:
+         raise HTTPException(status_code=500, detail="Age & gender model not loaded")
+
+     if nationality_model is None or not hasattr(nationality_model, 'model') or nationality_model.model is None:
+         raise HTTPException(status_code=500, detail="Nationality model not loaded")
+
+     try:
+         processed_audio, processed_sr = await process_audio_file(file)
+
+         # Get both predictions
+         age_gender_predictions = age_gender_model.predict(processed_audio, processed_sr)
+         nationality_predictions = nationality_model.predict(processed_audio, processed_sr)
+
+         return {
+             "success": True,
+             "predictions": {
+                 "demographics": age_gender_predictions,
+                 "nationality": nationality_predictions
+             }
+         }
+
+     except HTTPException:
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=str(e))
+
+ if __name__ == "__main__":
+     import uvicorn
+     port = int(os.environ.get("PORT", 7860))
+     uvicorn.run(
+         "app:app",
+         host="0.0.0.0",
+         port=port,
+         reload=False,  # Set to True for development
+         log_level="info"
+     )
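
For reference, a minimal client sketch for the endpoints defined above. It assumes the app is running locally on port 7860 and that a file named sample.wav sits next to the script; the base URL, the file name, and the use of the requests package are assumptions, not part of this commit.

# hypothetical client, not part of this commit
import requests

BASE_URL = "http://localhost:7860"  # assumed local deployment (HF Spaces expose the same port)

with open("sample.wav", "rb") as f:  # placeholder file name
    resp = requests.post(f"{BASE_URL}/predict_all",
                         files={"file": ("sample.wav", f, "audio/wav")})

resp.raise_for_status()
result = resp.json()
print(result["predictions"]["demographics"])  # output of AgeGenderModel.predict()
print(result["predictions"]["nationality"])   # output of NationalityModel.predict()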
models/age_and_gender_model.py ADDED
@@ -0,0 +1,128 @@
+ import os
+ import numpy as np
+ import audeer
+ import audonnx
+ import audinterface
+ import librosa
+
+ class AgeGenderModel:
+     def __init__(self, model_path="./cache/age_and_gender"):
+         self.model_path = model_path
+         self.model = None
+         self.interface = None
+         self.sampling_rate = 16000
+         os.makedirs(model_path, exist_ok=True)
+
+     def download_model(self):
+         model_onnx = os.path.join(self.model_path, 'model.onnx')
+         model_yaml = os.path.join(self.model_path, 'model.yaml')
+
+         if os.path.exists(model_onnx) and os.path.exists(model_yaml):
+             print("Age & gender model files already exist, skipping download.")
+             return True
+
+         print("Age & gender model files not found. Downloading...")
+
+         try:
+             cache_root = 'cache'
+             audeer.mkdir(cache_root)
+             audeer.mkdir(self.model_path)
+
+             def cache_path(file):
+                 return os.path.join(cache_root, file)
+
+             url = 'https://zenodo.org/record/7761387/files/w2v2-L-robust-24-age-gender.728d5a4c-1.1.1.zip'
+             dst_path = cache_path('model.zip')
+
+             if not os.path.exists(dst_path):
+                 print(f"Downloading model from {url}...")
+                 audeer.download_url(url, dst_path, verbose=True)
+
+             print(f"Extracting model to {self.model_path}...")
+             audeer.extract_archive(dst_path, self.model_path, verbose=True)
+
+             if os.path.exists(model_onnx) and os.path.exists(model_yaml):
+                 print("Age & gender model downloaded and extracted successfully!")
+
+                 if os.path.exists(dst_path):
+                     os.remove(dst_path)
+                 return True
+             else:
+                 print("Age & gender model extraction failed, files not found after extraction")
+                 return False
+
+         except Exception as e:
+             print(f"Error downloading age & gender model: {e}")
+             return False
+
+     def load(self):
+         try:
+             # Download model if needed
+             if not self.download_model():
+                 print("Failed to download age & gender model")
+                 return False
+
+             # Load the audonnx model
+             print("Loading age & gender model...")
+             self.model = audonnx.load(self.model_path)
+
+             # Create the audinterface Feature interface
+             outputs = ['logits_age', 'logits_gender']
+             self.interface = audinterface.Feature(
+                 self.model.labels(outputs),
+                 process_func=self.model,
+                 process_func_args={
+                     'outputs': outputs,
+                     'concat': True,
+                 },
+                 sampling_rate=self.sampling_rate,
+                 resample=False,  # We handle resampling manually
+                 verbose=False,
+             )
+             print("Age & gender model loaded successfully!")
+             return True
+         except Exception as e:
+             print(f"Error loading age & gender model: {e}")
+             return False
+
+
+     def predict(self, audio_data, sr):
+         if self.model is None or self.interface is None:
+             raise ValueError("Model not loaded. Call load() first.")
+
+         try:  # Process with the interface
+             result = self.interface.process_signal(audio_data, sr)
+
+             # Extract and process results
+             age_score = result['age'].values[0]
+             gender_logits = {
+                 'female': result['female'].values[0],
+                 'male': result['male'].values[0],
+                 'child': result['child'].values[0]
+             }
+
+             predicted_age = age_score * 100
+             gender_values = np.array(list(gender_logits.values()))
+             gender_probs = np.exp(gender_values) / np.sum(np.exp(gender_values))
+
+             gender_labels = ['female', 'male', 'child']
+             gender_probabilities = {
+                 label: float(prob) for label, prob in zip(gender_labels, gender_probs)
+             }
+
+             # Find most likely gender
+             predicted_gender = gender_labels[np.argmax(gender_probs)]
+             max_probability = float(np.max(gender_probs))
+
+             return {
+                 'age': {
+                     'predicted_age': float(predicted_age)
+                 },
+                 'gender': {
+                     'predicted_gender': predicted_gender,
+                     'probabilities': gender_probabilities,
+                     'confidence': max_probability
+                 }
+             }
+         except Exception as e:
+             raise Exception(f"Age & gender prediction error: {str(e)}")
models/nationality_model.py ADDED
@@ -0,0 +1,73 @@
+ import os
+ import torch
+ import numpy as np
+ from transformers import Wav2Vec2ForSequenceClassification, AutoFeatureExtractor
+
+ # Constants
+ MODEL_ID = "facebook/mms-lid-256"
+ SAMPLING_RATE = 16000
+
+ class NationalityModel:
+     def __init__(self, cache_dir="./cache/nationality"):
+         self.processor = None
+         self.model = None
+         self.cache_dir = cache_dir
+         os.makedirs(cache_dir, exist_ok=True)
+
+     def load(self):
+         try:
+             print(f"Loading nationality prediction model from {MODEL_ID}...")
+             self.processor = AutoFeatureExtractor.from_pretrained(MODEL_ID, cache_dir=self.cache_dir)
+             self.model = Wav2Vec2ForSequenceClassification.from_pretrained(MODEL_ID, cache_dir=self.cache_dir)
+             print("Nationality prediction model loaded successfully!")
+             return True
+         except Exception as e:
+             print(f"Error loading nationality prediction model: {e}")
+             return False
+
+     def predict(self, audio_data, sampling_rate):
+         if self.model is None or self.processor is None:
+             raise ValueError("Model not loaded. Call load() first.")
+
+         try:
+             # Ensure audio is properly formatted (float32, mono)
+             if len(audio_data.shape) > 1:
+                 audio_data = audio_data.mean(axis=0)
+
+             audio_data = audio_data.astype(np.float32)
+
+             # Process audio with the feature extractor
+             inputs = self.processor(audio_data, sampling_rate=sampling_rate, return_tensors="pt")
+
+             # Get model predictions
+             with torch.no_grad():
+                 outputs = self.model(**inputs).logits
+
+             # Get top 5 predictions
+             probabilities = torch.nn.functional.softmax(outputs, dim=-1)[0]
+             top_k_values, top_k_indices = torch.topk(probabilities, k=5)
+
+             # Convert to language codes and probabilities
+             top_languages = []
+             for i, idx in enumerate(top_k_indices):
+                 lang_id = idx.item()
+                 lang_code = self.model.config.id2label[lang_id]
+                 probability = top_k_values[i].item()
+                 top_languages.append({
+                     "language_code": lang_code,
+                     "probability": probability
+                 })
+
+             # Get the most likely language
+             predicted_lang_id = torch.argmax(outputs, dim=-1)[0].item()
+             predicted_lang = self.model.config.id2label[predicted_lang_id]
+             max_probability = probabilities[predicted_lang_id].item()
+
+             return {
+                 "predicted_language": predicted_lang,
+                 "confidence": max_probability,
+                 "top_languages": top_languages
+             }
+
+         except Exception as e:
+             raise Exception(f"Nationality prediction error: {str(e)}")
requirements.txt ADDED
@@ -0,0 +1,12 @@
+ fastapi[all]
+ uvicorn[standard]
+ python-multipart
+ audonnx
+ audinterface
+ librosa
+ numpy
+ audeer
+ torch
+ transformers
+ torchaudio
+ datasets