vitorcalvi committed
Commit 8842208 · 1 Parent(s): 3c7e9f6

Add application file

Files changed (3)
  1. Dockerfile +21 -0
  2. app.py +139 -0
  3. requirements.txt +8 -0
Dockerfile ADDED
@@ -0,0 +1,21 @@
+ # Start from a standard Python base image
+ FROM python:3.9
+
+ # Set the working directory inside the container
+ WORKDIR /code
+
+ # Copy the requirements file into the container
+ COPY ./requirements.txt /code/requirements.txt
+
+ # Install the Python dependencies
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Copy your application code into the container
+ COPY ./app.py /code/app.py
+
+ # Expose the port the app runs on
+ EXPOSE 8000
+
+ # The command to run your FastAPI app using uvicorn
+ # This will be run when the container starts
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]
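For local testing, the image can be exercised with something like `docker build -t emotion-api .` followed by `docker run -p 8000:8000 emotion-api` (the tag `emotion-api` is an illustrative name, not part of this commit). On Hugging Face Spaces the Dockerfile is built and started automatically, so these commands are only needed when running outside the platform.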
app.py ADDED
@@ -0,0 +1,139 @@
+ from fastapi import FastAPI, File, UploadFile, HTTPException
+ from transformers import AutoModelForAudioClassification, AutoFeatureExtractor
+ import librosa
+ import torch
+ import numpy as np
+ import tempfile
+ import os
+ from functools import lru_cache
+
+ app = FastAPI(title="Speech Emotion Recognition API")
+
+ # Global variables for model caching
+ model = None
+ feature_extractor = None
+ id2label = None
+
+ @lru_cache(maxsize=1)
+ def load_model():
+     """Load model once and cache it for CPU optimization"""
+     global model, feature_extractor, id2label
+
+     model_id = "firdhokk/speech-emotion-recognition-with-openai-whisper-large-v3"
+
+     # Force CPU usage and limit threads for the free tier
+     torch.set_num_threads(2)
+
+     model = AutoModelForAudioClassification.from_pretrained(
+         model_id,
+         torch_dtype=torch.float32,  # Use float32 for CPU
+         device_map="cpu"
+     )
+     feature_extractor = AutoFeatureExtractor.from_pretrained(
+         model_id,
+         do_normalize=True
+     )
+     id2label = model.config.id2label
+
+     return model, feature_extractor, id2label
+
+ def preprocess_audio(audio_path, feature_extractor, max_duration=30.0):
+     """Preprocess audio with memory optimization"""
+     audio_array, sampling_rate = librosa.load(
+         audio_path,
+         sr=feature_extractor.sampling_rate,
+         duration=max_duration  # Limit duration for CPU efficiency
+     )
+
+     # Trim or zero-pad to a fixed length before feature extraction
+     max_length = int(feature_extractor.sampling_rate * max_duration)
+     if len(audio_array) > max_length:
+         audio_array = audio_array[:max_length]
+     else:
+         audio_array = np.pad(audio_array, (0, max_length - len(audio_array)))
+
+     inputs = feature_extractor(
+         audio_array,
+         sampling_rate=feature_extractor.sampling_rate,
+         max_length=max_length,
+         truncation=True,
+         return_tensors="pt",
+     )
+     return inputs
+
+ @app.on_event("startup")
+ async def startup_event():
+     """Load model on startup so the first request is not slow"""
+     load_model()
+
+ @app.post("/predict-emotion")
+ async def predict_emotion(file: UploadFile = File(...)):
+     """Predict emotion from uploaded audio file"""
+     try:
+         # Validate file type
+         if not file.filename.lower().endswith(('.wav', '.mp3', '.m4a', '.flac')):
+             raise HTTPException(status_code=400, detail="Unsupported audio format")
+
+         # Save uploaded file temporarily
+         with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
+             content = await file.read()
+             tmp_file.write(content)
+             tmp_file_path = tmp_file.name
+
+         try:
+             # Load cached model
+             model, feature_extractor, id2label = load_model()
+
+             # Preprocess and predict
+             inputs = preprocess_audio(tmp_file_path, feature_extractor)
+
+             with torch.no_grad():
+                 outputs = model(**inputs)
+                 logits = outputs.logits
+                 predicted_id = torch.argmax(logits, dim=-1).item()
+                 predicted_label = id2label[predicted_id]
+
+             # Get confidence scores
+             probabilities = torch.softmax(logits, dim=-1)
+             confidence = probabilities[0][predicted_id].item()
+
+             return {
+                 "predicted_emotion": predicted_label,
+                 "confidence": round(confidence, 4),
+                 "all_emotions": {
+                     id2label[i]: round(probabilities[0][i].item(), 4)
+                     for i in range(len(id2label))
+                 }
+             }
+
+         finally:
+             # Clean up temporary file
+             os.unlink(tmp_file_path)
+
+     except HTTPException:
+         # Propagate explicit HTTP errors (like the 400 above) instead of masking them as 500s
+         raise
+     except Exception as e:
+         raise HTTPException(status_code=500, detail=f"Processing error: {str(e)}")
+
+ @app.get("/health")
+ async def health_check():
+     """Health check endpoint"""
+     return {"status": "healthy", "model_loaded": model is not None}
+
+ @app.get("/")
+ async def root():
+     """Root endpoint with API information"""
+     return {
+         "message": "Speech Emotion Recognition API",
+         "model": "Whisper Large V3",
+         "emotions": ["Angry", "Disgust", "Fearful", "Happy", "Neutral", "Sad", "Surprised"],
+         "endpoints": {
+             "predict": "/predict-emotion",
+             "health": "/health"
+         }
+     }
+
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run(app, host="0.0.0.0", port=8000)
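For reference, a minimal client sketch for the /predict-emotion endpoint. It assumes the `requests` package is installed, that a local sample.wav exists, and that the server is reachable on the host/port the Dockerfile exposes; none of these names are part of the commit itself.

    import requests

    # Hypothetical local test: POST an audio file to the running container.
    # The multipart field name must be "file" to match the UploadFile parameter.
    with open("sample.wav", "rb") as f:
        response = requests.post(
            "http://localhost:8000/predict-emotion",
            files={"file": ("sample.wav", f, "audio/wav")},
        )
    response.raise_for_status()
    print(response.json())  # predicted_emotion, confidence, all_emotions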
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ fastapi
+ uvicorn[standard]
+ transformers
+ torch
+ librosa
+ numpy
+ python-multipart
+ accelerate