devcom33 committed
Commit · 6f46074
1 parent: 08654c0
Updated FastAPI code

Files changed:
- app.py +154 -27
- config.py +10 -3
- models.py +51 -26
- requirements.txt +5 -1
- services.py +425 -13
- utils.py +96 -0
app.py
CHANGED
@@ -1,68 +1,161 @@
import logging
import traceback
import os
import json
import re
import time
from fastapi import FastAPI, UploadFile, File, HTTPException, Form
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import config
from models import load_whisper, load_summarizer, load_spacy
from services import process_transcription, process_summary, create_enhanced_summary_prompt, format_summary_to_markdown, get_language_name
from utils import webm_to_wav
import google.generativeai as genai
from google.api_core import exceptions as api_core_exceptions

logger = logging.getLogger(__name__)

app = FastAPI(
    title="Transcription and Summarization API",
    description="API using Faster-Whisper, spaCy, and Hugging Face Transformers",
    version="1.0.0",
)

api_key = os.getenv("GEMINI_API_KEY")
if not api_key:
    logger.critical("GEMINI_API_KEY environment variable not set.")
else:
    genai.configure(api_key=api_key)

logger.info("Application starting up - loading models...")
whisper_model = load_whisper(config)
summarizer_pipeline = load_summarizer(config)
nlp_spacy = load_spacy(config)
logger.info("Model loading complete.")

origins = ["http://localhost:8080"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

if not whisper_model:
    logger.critical(
        "Whisper model failed to load. Transcription endpoint will be unavailable."
    )
if not summarizer_pipeline:
    logger.critical(
        "Summarizer pipeline failed to load. Summarization endpoint will be unavailable."
    )
if not nlp_spacy:
    logger.warning(
        "SpaCy model failed to load. Summarization will proceed without spaCy preprocessing."
    )

class TranscriptInput(BaseModel):
    transcript: str
    language: str = "en"

@app.get("/health")
def health():
    return {
        "status": "ok",
        "whisper_loaded": whisper_model is not None,
        "summarizer_loaded": summarizer_pipeline is not None,
        "spacy_loaded": nlp_spacy is not None,
    }

@app.post("/transcribe")
async def transcription(
    audio_file: UploadFile = File(...),
    enable_diarization: bool = Form(False)
):
    if whisper_model is None:
        raise HTTPException(status_code=503, detail="Transcription service unavailable.")

    try:
        start_time = time.time()
        content_type = audio_file.content_type
        content = await audio_file.read()

        if content_type in ["audio/webm", "video/webm"]:
            wav_path = webm_to_wav(content)
            with open(wav_path, "rb") as f:
                wav_bytes = f.read()
            os.remove(wav_path)
        elif content_type == "audio/wav":
            wav_bytes = content
        else:
            raise HTTPException(status_code=400, detail="Unsupported audio format. Use .webm or .wav")

        transcript, info, diarized_segments = process_transcription(
            wav_bytes,
            whisper_model,
            enable_diarization=enable_diarization
        )

        processing_time = time.time() - start_time
        logger.info(f"Transcription successful. Language: {info.language}, Time: {processing_time:.2f}s")

        speakers = []
        if diarized_segments:
            for segment in diarized_segments:
                if segment["speaker"] not in speakers:
                    speakers.append(segment["speaker"])

        response = {
            "transcript": transcript,
            "language": info.language,
            "duration": info.duration,
        }

        if enable_diarization and diarized_segments:
            response["speakers"] = speakers
            response["segments"] = diarized_segments

        return response
    except HTTPException as http_exc:
        raise http_exc
    except ValueError as ve:
        logger.error(f"Value error during transcription processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logger.error(f"Unhandled error during transcription: {e}\n{traceback.format_exc()}")
        raise HTTPException(status_code=500, detail="Internal server error during transcription.")

@app.post("/summarize")
async def summarize(input: TranscriptInput):
    if not input.transcript or not input.transcript.strip():
        raise HTTPException(status_code=400, detail="Transcript cannot be empty.")
    try:
        prompt = f"""
        Summarize the following text concisely:
        Transcript:
        \"\"\"
        {input.transcript}
        \"\"\"
        """
        model = genai.GenerativeModel('gemini-1.5-flash')
        response = model.generate_content(prompt)
        logger.info(f"Gemini /summarize response text: '{response.text}'")
        return {"summary": response.text}
    except api_core_exceptions.ResourceExhausted as e:
        logger.error(f"Gemini API rate limit exceeded: {e}")
        raise HTTPException(status_code=429, detail="API rate limit exceeded. Please wait and try again.")
    except genai.types.BlockedPromptError as e:
        logger.error(f"The prompt was blocked: {e}")
        raise HTTPException(status_code=400, detail="The request was blocked by the content safety filter.")
    except Exception as e:
        logger.error(f"An unexpected error occurred during basic summarization: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail=str(e))

@app.post("/smart-summary")
def smart_summarize(input: TranscriptInput):
    if summarizer_pipeline is None:
        raise HTTPException(status_code=503, detail="Summarization service unavailable.")
    if not input.transcript:
@@ -71,11 +164,45 @@ def summarize(input: TranscriptInput):
    try:
        summary = process_summary(input.transcript, summarizer_pipeline, nlp_spacy, config)
        return {"summary": summary}
    except ValueError as ve:
        logger.error(f"Value error during summary processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logger.error(f"Unhandled error during summarization: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error during summarization.")

@app.post("/enhanced-summary")
async def enhanced_summary(input: TranscriptInput):
    if not input.transcript or not input.transcript.strip():
        raise HTTPException(status_code=400, detail="Transcript cannot be empty.")

    try:
        language_name = get_language_name(input.language)
        prompt = create_enhanced_summary_prompt(input.transcript, language_name)
        model = genai.GenerativeModel('gemini-1.5-flash')

        response = model.generate_content(
            contents=prompt,
            generation_config=genai.GenerationConfig(response_mime_type="application/json")
        )

        try:
            cleaned_text = re.sub(r"```json\s*(.*)\s*```", r"\1", response.text, flags=re.DOTALL)
            summary_json = json.loads(cleaned_text)
            logger.info(f"Received JSON from Gemini: {summary_json}")
        except (json.JSONDecodeError, TypeError) as e:
            logger.error(f"Failed to parse LLM response as JSON: {e}\nResponse text: {response.text}")
            raise HTTPException(status_code=500, detail="Failed to generate a structured summary due to an invalid model response.")

        formatted_markdown = format_summary_to_markdown(summary_json)
        logger.info(f"Formatted Markdown: {formatted_markdown}")
        return {"summary": formatted_markdown}
    except api_core_exceptions.ResourceExhausted as e:
        logger.error(f"Gemini API rate limit exceeded: {e}")
        raise HTTPException(status_code=429, detail="API rate limit exceeded. Please wait and try again.")
    except genai.types.BlockedPromptError as e:
        logger.error(f"The prompt was blocked: {e}")
        raise HTTPException(status_code=400, detail="The request was blocked by the content safety filter.")
    except Exception as e:
        logger.error(f"An unexpected error occurred during summarization: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="An internal server error occurred during summarization.")
config.py
CHANGED
@@ -1,15 +1,22 @@
import psutil
import os
from dotenv import load_dotenv
import nltk


load_dotenv()
WHISPER_MODEL_NAME = "tiny"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
PYANNOTE_AUTH_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

#SUMMARIZER_MODEL = "google/flan-t5-base"
SUMMARIZER_MODEL = "facebook/bart-large-cnn"
SUMMARIZER_MAX_LENGTH = 150
SUMMARIZER_MIN_LENGTH = 50

SPACY_MODEL = "en_core_web_sm"
CPU_THREADS = max(1, psutil.cpu_count(logical=False))

if not PYANNOTE_AUTH_TOKEN:
    raise ValueError("HUGGINGFACE_API_KEY not set in environment variables")
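
config.py now fails fast when HUGGINGFACE_API_KEY is missing, while app.py separately expects GEMINI_API_KEY. A minimal pre-flight sketch, assuming a .env file in the working directory:

import os
from dotenv import load_dotenv

load_dotenv()  # same mechanism config.py uses
for key in ("HUGGINGFACE_API_KEY", "GEMINI_API_KEY"):
    print(key, "set" if os.getenv(key) else "MISSING")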
models.py
CHANGED
@@ -1,61 +1,84 @@
import logging
from faster_whisper import WhisperModel
import spacy
from transformers import pipeline
import os
import torch
from pyannote.audio import Pipeline


logger = logging.getLogger(__name__)
_diarize_model = None


def load_whisper(config):
    logger.info("Loading Whisper model...")
    try:
        model = WhisperModel(
            config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
            cpu_threads=config.CPU_THREADS,
        )
        logger.info(
            f"Whisper model '{config.WHISPER_MODEL_NAME}' loaded on {config.WHISPER_DEVICE}."
        )

        return model
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
        return None


def load_diarization(config):
    global _diarize_model
    logger.info("Loading PYANNOTE model...")

    if _diarize_model is None and hasattr(config, "PYANNOTE_AUTH_TOKEN"):
        try:
            logger.info("Loading diarization model")
            _diarize_model = Pipeline.from_pretrained(
                "pyannote/speaker-diarization-3.0",
                use_auth_token=config.PYANNOTE_AUTH_TOKEN,
            )

            # Move to GPU if available
            if (
                hasattr(config, "WHISPER_DEVICE")
                and config.WHISPER_DEVICE == "cuda"
                and torch.cuda.is_available()
            ):
                _diarize_model = _diarize_model.to(torch.device("cuda"))

            logger.info("Diarization model loaded successfully")
        except Exception as e:
            logger.error(f"Failed to load diarization model: {e}", exc_info=True)

    return _diarize_model


def load_summarizer(config):
    logger.info("Loading Summarization pipeline...")
    try:
        summarizer = pipeline(
            "text2text-generation",
            model=config.SUMMARIZER_MODEL,
            device=0 if torch.cuda.is_available() else -1,
        )
        logger.info("Summarization pipeline loaded.")
        return summarizer
    except Exception as e:
        logger.error(f"Failed to load Summarization pipeline: {e}", exc_info=True)
        return None


def load_spacy(config):
    logger.info("Loading spaCy model...")

    try:
        nlp = spacy.load("en_core_web_sm")
        logger.info("spaCy model 'en_core_web_sm' loaded.")

        return nlp

    except OSError:
@@ -66,9 +89,11 @@ def load_spacy(config):
            nlp = spacy.load("en_core_web_sm")
            logger.info("spaCy model 'en_core_web_sm' downloaded and loaded.")
            return nlp

        except Exception as download_e:
            logger.error(
                f"Failed to download or load spaCy model 'en_core_web_sm': {download_e}"
            )
            return None

    except Exception as e:
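
A standalone smoke-test sketch for these loaders, assuming config.py is importable and the model downloads can complete (network access, plus the pyannote token); each loader deliberately returns None instead of raising:

import config
from models import load_whisper, load_summarizer, load_spacy, load_diarization

loaded = {
    "whisper": load_whisper(config),
    "summarizer": load_summarizer(config),
    "spacy": load_spacy(config),
    # cached in the module-level _diarize_model, so repeated calls are cheap
    "diarization": load_diarization(config),
}
print({name: model is not None for name, model in loaded.items()})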
requirements.txt
CHANGED
@@ -7,4 +7,8 @@ spacy
pydub
psutil
python-multipart
en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
pyannote
nltk
google.generativeai
google
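
Note that utils.py shells out to ffmpeg for the webm-to-wav conversion; ffmpeg is a system binary that pip install -r requirements.txt will not provide, so it must be available on PATH separately. The en-core-web-sm line installs the spaCy model directly from the pinned release wheel.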
services.py
CHANGED
@@ -1,10 +1,30 @@
import logging
import os
import tempfile
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize.punkt import PunktSentenceTokenizer
from nltk.data import load
import pickle
import re
from utils import clean_transcript, consolidate_similar_items, chunk_text
from transformers import pipeline
import config
from models import load_diarization
import wave
import gc
import torch
import time
import pycountry
from functools import lru_cache


logger = logging.getLogger(__name__)

_diarize_model = None

def process_transcription(audio_content: bytes, whisper_model, enable_diarization=False):
    start = time.time()
    if not whisper_model:
        raise ValueError("Whisper model not loaded.")

@@ -14,9 +34,123 @@ def process_transcription(audio_content: bytes, whisper_model):
            temp_file_path = temp_file.name
            temp_file.write(audio_content)

        segments_gen, info = whisper_model.transcribe(temp_file_path, beam_size=5)

        segments = list(segments_gen)

        transcript = " ".join([seg.text.strip() for seg in segments])


        global _diarize_model

        if not enable_diarization:
            return transcript, info, None

        if _diarize_model is None:
            _diarize_model = load_diarization(config)

        if _diarize_model is None:
            logger.warning("Diarization model not available, returning transcript without speakers")
            return transcript, info, None

        with wave.open(temp_file_path, 'rb') as wav:
            frames = wav.getnframes()
            rate = wav.getframerate()
            # calculate audio duration
            audio_duration = frames / float(rate)


        if audio_duration < 3.0:
            logger.info(f"Audio too short ({audio_duration:.2f}s), skipping diarization")
            diarized_segments = [{"speaker": "SPEAKER_0", "text": transcript}]
            diarized_transcript = f"[SPEAKER_0]: {transcript}"
            return diarized_transcript, info, diarized_segments


        logger.info("Running speaker diarization")
        diarization = _diarize_model(temp_file_path)

        # Extract diarization segments
        diarize_segments = []
        for turn, _, speaker in diarization.itertracks(yield_label=True):
            diarize_segments.append({
                "speaker": f"SPEAKER_{speaker.replace('SPEAKER_', '')}",
                "start": turn.start,
                "end": turn.end
            })

        diarized_segments = []

        for segment in segments:
            # Find best matching speaker based on time overlap
            best_speaker = None
            max_overlap = 0
            seg_start = segment.start
            seg_end = segment.end

            for diar_seg in diarize_segments:
                diar_start = diar_seg["start"]
                diar_end = diar_seg["end"]
                # Calculate overlap
                overlap_start = max(seg_start, diar_start)
                overlap_end = min(seg_end, diar_end)

                if overlap_end > overlap_start:
                    overlap = overlap_end - overlap_start
                    if overlap > max_overlap:
                        max_overlap = overlap
                        best_speaker = diar_seg["speaker"]

            # If no overlap found, assign to the closest speaker
            if best_speaker is None:
                min_distance = float('inf')
                for diar_seg in diarize_segments:
                    # Distance to start of segment
                    dist_start = abs(seg_start - diar_seg["start"])
                    # Distance to end of segment
                    dist_end = abs(seg_end - diar_seg["end"])
                    # Take the minimum
                    dist = min(dist_start, dist_end)

                    if dist < min_distance:
                        min_distance = dist
                        best_speaker = diar_seg["speaker"]

            diarized_segments.append({
                "speaker": best_speaker or "SPEAKER_UNKNOWN",
                "text": segment.text,
                "start": segment.start,
                "end": segment.end
            })

        # Format diarized transcript
        diarized_transcript = ""
        current_speaker = None

        for segment in diarized_segments:
            speaker = segment["speaker"]
            text = segment["text"].strip()

            if not text:
                continue

            if speaker != current_speaker:
                diarized_transcript += f"\n[{speaker}]: {text}"
                current_speaker = speaker
            else:
                diarized_transcript += f" {text}"

        # Clean up memory
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

        end = time.time()

        logger.info(f"time: {(end - start) * 1000:.0f} ms")

        return diarized_transcript, info, diarized_segments

    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
@@ -25,21 +159,299 @@ def process_summary(text: str, summarizer_pipeline, nlp_spacy, config):
    if not summarizer_pipeline:
        raise ValueError("Summarizer model not loaded.")

    # clean transcript
    cleaned_transcript = clean_transcript(text)

    processed_text = cleaned_transcript

    doc = None

    if nlp_spacy:
        try:
            doc = nlp_spacy(processed_text)
            sentences = [sent.text.strip() for sent in doc.sents]
            processed_text = " ".join(sentences)
        except Exception as e:
            logger.error(f"SpaCy processing failed: {e}", exc_info=True)

    categories = {
        "meeting_title": [],
        "intro": [],
        "topics": [],
        "decisions": [],
        "action_items": [],
        "questions": [],
        "deadlines": [],
        "participants": [],
        "overall_summary": [],
        "conclusion": []
    }

    # extract the meeting title
    title_pattern = r'(meeting|call|session|discussion) (about|on|for|regarding) ([^.]+)'
    title_matches = re.findall(title_pattern, processed_text, re.IGNORECASE)

    if title_matches:
        categories["meeting_title"].append(title_matches[0][2].strip())

    if doc:
        sentences = [sent.text.strip() for sent in doc.sents]
    else:
        try:
            with open("/home/heymouad/nltk_data/tokenizers/punkt/english.pickle", "rb") as f:
                tokenizer = pickle.load(f)
            sentences = tokenizer.tokenize(processed_text)
        except Exception as e:
            logger.error(f"NLTK tokenization failed: {e}", exc_info=True)
            sentences = sent_tokenize(processed_text)

    # Find participants
    people = set()
    if doc:
        for ent in doc.ents:
            if ent.label_ == "PERSON":
                person = ent.text.strip()
                if len(person) > 2:
                    people.add(person)

    if people:
        categories["participants"] = list(people)

    try:
        # chunk the text to respect the BART model's input limit
        logger.info(processed_text[:100])
        processed_text = chunk_text(processed_text)
        parts_summaries = []

        for chunk in processed_text:
            result = summarizer_pipeline(chunk, max_length=150, min_length=30, do_sample=False)
            if result and isinstance(result, list) and len(result) > 0:
                part_summary = result[0].get('summary_text', '')
                if part_summary:
                    parts_summaries.append(part_summary)

        overall_summary = " ".join(parts_summaries)
        overall_summary = summarizer_pipeline(overall_summary, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
        categories["overall_summary"] = [overall_summary]

    except Exception as e:
        logger.error(f"Summarization failed: {e}", exc_info=True)
        categories["overall_summary"] = ["Failed to generate overall summary."]

    # Process each sentence
    for i, sentence in enumerate(sentences):
        sentence = sentence.strip()
        if not sentence:
            continue

        # Check for action items
        if (re.search(r'(need to|will|shall|must|should|have to|assigned to|responsible for|task|action item|to-do|follow up|take care of)',
                      sentence, re.IGNORECASE) and
                re.search(r'(we|you|I|they|he|she|team|group|department)', sentence, re.IGNORECASE)):

            categories["action_items"].append(sentence)
            continue

        # Check for decisions
        if re.search(r'(decided|agreed|conclusion|resolved|approved|rejected|consensus|finalized|confirmed|determined)',
                     sentence, re.IGNORECASE):
            categories["decisions"].append(sentence)
            continue

        # Check for deadlines/timing with stronger patterns
        if re.search(r'(by|due|deadline|schedule|date|tomorrow|next week|month|calendar|remind|upcoming|on|at|until)',
                     sentence, re.IGNORECASE) and re.search(r'(time|day|week|month|year|hour|minute)', sentence, re.IGNORECASE):
            categories["deadlines"].append(sentence)
            continue

        # Check for questions/issues
        if (re.search(r'(\?|issue|problem|concern|question|clarif|wonder|how|what|when|where|why|who)',
                      sentence, re.IGNORECASE) and
                not re.search(r'(answer|answered|resolved|solved)', sentence, re.IGNORECASE)):
            categories["questions"].append(sentence)
            continue

        # Check for intro statements
        if i < len(sentences) // 10:  # First 10% of sentences
            if re.search(r'(welcome|begin|start|agenda|today|discuss|meeting|introduce|opening|good morning|hello|topic)',
                         sentence, re.IGNORECASE):
                categories["intro"].append(sentence)
                continue

        # Check for conclusion statements
        if i > len(sentences) * 9 // 10:  # Last 10% of sentences
            if re.search(r'(conclude|end|wrap|summary|thank|next meeting|follow up|adjourn|goodbye|bye|closing)',
                         sentence, re.IGNORECASE):
                categories["conclusion"].append(sentence)
                continue

        # Everything else is considered a topic if it has substance
        if len(sentence.split()) > 3:  # Avoid very short sentences
            categories["topics"].append(sentence)

    # Process categories to avoid repetition and consolidate related points
    for category in categories:
        if category in ["topics", "action_items", "decisions", "questions", "deadlines"]:
            categories[category] = consolidate_similar_items(categories[category])

    # Limit the number of topics to avoid overwhelming
    if len(categories["topics"]) > 10:
        # If we have a summarizer, try to generate a summary of topics
        try:
            topics_text = " ".join(categories["topics"])
            topics_summary = summarizer_pipeline(topics_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
            categories["topics"] = sent_tokenize(topics_summary)
        except Exception as e:
            logger.error(f"Topics summarization failed: {e}", exc_info=True)
            # Otherwise just take the first few and last few topics
            categories["topics"] = categories["topics"][:5] + categories["topics"][-5:]

    # Add emojis to formatted output
    formatted_summary = []

    # Format meeting title if available
    if categories.get("meeting_title"):
        formatted_summary.append(f"📝 **Meeting Title:** {categories['meeting_title'][0]}")
        formatted_summary.append("")

    # Add overall summary
    if categories.get("overall_summary"):
        formatted_summary.append("📋 **Executive Summary:**")
        formatted_summary.append(categories["overall_summary"][0])
        formatted_summary.append("")

    # Format participants
    if categories["participants"]:
        formatted_summary.append("👥 **Participants:**")
        formatted_summary.append(", ".join(categories["participants"]))
        formatted_summary.append("")

    # Format intro
    if categories["intro"]:
        formatted_summary.append("🎯 **Meeting Introduction:**")
        formatted_summary.append(" ".join(categories["intro"]))
        formatted_summary.append("")

    # Format main topics
    if categories["topics"]:
        formatted_summary.append("💡 **Key Topics:**")
        for i, topic in enumerate(categories["topics"], 1):
            formatted_summary.append(f"{i}. {topic}")
        formatted_summary.append("")

    # Format decisions
    if categories["decisions"]:
        formatted_summary.append("✅ **Decisions Made:**")
        for decision in categories["decisions"]:
            formatted_summary.append(f"• {decision}")
        formatted_summary.append("")

    # Format action items
    if categories["action_items"]:
        formatted_summary.append("📋 **Action Items:**")
        for item in categories["action_items"]:
            formatted_summary.append(f"• {item}")
        formatted_summary.append("")

    # Format questions
    if categories["questions"]:
        formatted_summary.append("❓ **Questions & Concerns:**")
        for question in categories["questions"]:
            formatted_summary.append(f"• {question}")
        formatted_summary.append("")

    # Format deadlines
    if categories["deadlines"]:
        formatted_summary.append("⏰ **Deadlines & Timing:**")
        for deadline in categories["deadlines"]:
            formatted_summary.append(f"• {deadline}")
        formatted_summary.append("")

    # Format conclusion
    if categories["conclusion"]:
        formatted_summary.append("🏁 **Conclusion:**")
        formatted_summary.append(" ".join(categories["conclusion"]))

    return "\n".join(formatted_summary)


def create_enhanced_summary_prompt(transcript: str, language_name: str) -> str:
    """
    Creates a single, dynamic and insistent prompt that instructs the AI
    to output its findings in the specified language.
    """
    return f"""
You are an expert AI assistant. Your task is to analyze the following meeting transcript and extract key information into a structured JSON object.

**Primary Goal:** Analyze the provided transcript and generate a structured summary.

**CRITICAL LANGUAGE INSTRUCTION:** All text in your final JSON response must be written in the following language: **{language_name}**. There are no exceptions.

**ANALYSIS INSTRUCTIONS:**
1. Read the entire transcript to understand its context.
2. Identify a concise title for the meeting.
3. Identify all participants mentioned.
4. Write a brief paragraph summarizing the core themes and outcomes.
5. List all clear and agreed-upon decisions.
6. Extract all clear action items, identifying the task, who it was assigned to, the due date if mentioned, and the context.

**OUTPUT INSTRUCTIONS:**
- Respond ONLY with a valid JSON object.
- The JSON must use these exact keys: "meeting_title", "participants", "meeting_summary", "decisions_made", "action_items".
- **Language Check:** Before you finalize your response, verify that every single string value within the JSON is written in **{language_name}**.

**TRANSCRIPT TO ANALYZE:**
\"\"\"
{transcript}
\"\"\"
"""

@lru_cache(maxsize=None)
def get_language_name(language_code: str) -> str:
    """Converts a two-letter language code (e.g., 'es') to its full name (e.g., 'Spanish')."""
    try:
        lang = pycountry.languages.get(alpha_2=language_code)
        return lang.name if lang else language_code
    except Exception:
        return language_code

def format_summary_to_markdown(summary_json: dict) -> str:
    """Converts the structured JSON summary into a formatted Markdown string."""

    summary_data = {k.lower().replace(" ", "_"): v for k, v in summary_json.items()}

    if not summary_data.get("meeting_summary") and not summary_data.get("decisions_made") and not summary_data.get("action_items"):
        return "The provided transcript was too short or lacked sufficient content to generate a detailed summary."

    markdown_parts = []

    if title := summary_data.get("meeting_title"):
        markdown_parts.append(f"### {title}\n")

    if summary := summary_data.get("meeting_summary"):
        markdown_parts.append("📝 **Meeting Summary:**")
        markdown_parts.append(summary)
        markdown_parts.append("")

    if decisions := summary_data.get("decisions_made"):
        markdown_parts.append("📌 **Decisions Made:**")
        for decision in decisions:
            markdown_parts.append(f"- {decision}")
        markdown_parts.append("")

    if action_items := summary_data.get("action_items"):
        markdown_parts.append("✅ **Action Items:**")
        for item in action_items:
            task = item.get('task', item.get('Task', 'N/A'))
            assigned_to = item.get('assigned_to', item.get('Assigned To', 'Not specified'))
            due_date = item.get('due_date', item.get('Due Date', 'Not specified'))
            context = item.get('context', item.get('Context', ''))

            markdown_parts.append(f"- **Task**: {task}")
            markdown_parts.append(f"  - **Assigned To**: {assigned_to}")
            markdown_parts.append(f"  - **Due Date**: {due_date}")
            if context:
                markdown_parts.append(f"  - **Context**: {context}")
        markdown_parts.append("")

    return "\n".join(markdown_parts)
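
The speaker assignment above picks, for each Whisper segment, the diarization turn with the greatest time overlap, falling back to the nearest turn when nothing overlaps. A toy walk-through of the overlap rule with made-up timings:

# A Whisper segment spanning 0.0-4.0s overlaps SPEAKER_00's turn by 2.5s
# and SPEAKER_01's turn by 1.5s, so it is attributed to SPEAKER_00.
diarize_segments = [
    {"speaker": "SPEAKER_00", "start": 0.0, "end": 2.5},
    {"speaker": "SPEAKER_01", "start": 2.5, "end": 6.0},
]
seg_start, seg_end = 0.0, 4.0

best_speaker, max_overlap = None, 0.0
for d in diarize_segments:
    overlap = min(seg_end, d["end"]) - max(seg_start, d["start"])
    if overlap > max_overlap:
        max_overlap, best_speaker = overlap, d["speaker"]

print(best_speaker, max_overlap)  # SPEAKER_00 2.5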
utils.py
CHANGED
@@ -0,0 +1,96 @@
import re
import subprocess
import tempfile
import os



def clean_transcript(text):
    """Clean the transcript by removing filler words and consolidating sentences."""

    filler_words = [
        r'\bum\b', r'\buh\b', r'\blike\b', r'\byou know\b', r'\bkind of\b',
        r'\bsort of\b', r'\bI mean\b', r'\bbasically\b', r'\bactually\b',
        r'\bso\b', r'\banyway\b', r'\blike\b', r'\bjust\b'
    ]

    for word in filler_words:
        text = re.sub(f"{word}", "", text, flags=re.IGNORECASE)

    text = re.sub(r'\s+', ' ', text)

    return text.strip()

def consolidate_similar_items(items):
    """Consolidate similar items to reduce repetition."""
    if not items or len(items) <= 1:
        return items

    # Simple similarity measure based on word overlap
    result = [items[0]]
    for item in items[1:]:
        # Convert to sets of words for comparison
        item_words = set(item.lower().split())

        # Check if this item is too similar to any existing item
        too_similar = False
        for existing_item in result:
            existing_words = set(existing_item.lower().split())
            # Calculate Jaccard similarity
            intersection = len(item_words.intersection(existing_words))
            union = len(item_words.union(existing_words))
            if union > 0 and intersection / union > 0.6:  # 60% similarity threshold
                too_similar = True
                break

        if not too_similar:
            result.append(item)

    return result


def chunk_text(text, max_tokens=800):
    words = text.split()
    chunks = []
    if len(words) > max_tokens:
        for i in range(0, len(words), max_tokens):
            chunk = " ".join(words[i:i + max_tokens])
            chunks.append(chunk)
        return chunks
    else:
        return [text]


def webm_to_wav(webm_bytes: bytes) -> str:
    """
    Converts webm audio bytes to a wav file and returns the path to the wav file.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as webm_file:
        webm_path = webm_file.name
        webm_file.write(webm_bytes)

    wav_path = webm_path.replace(".webm", ".wav")

    try:
        subprocess.run([
            "ffmpeg", "-y", "-i", webm_path, "-ar", "16000", "-ac", "1", wav_path
        ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    finally:
        os.remove(webm_path)

    return wav_path


def preprocess_transcript(transcript: str) -> str:
    """
    Cleans and normalizes the transcript.
    - Removes extra whitespace.
    - Can be expanded to handle speaker diarization, e.g., "Speaker A:" -> "Alice:"
    """
    # Simple cleaning
    text = re.sub(r'\s+', ' ', transcript).strip()

    # Advanced: A future step could be to normalize speaker names
    # e.g., mapping "Bob's update" to "Bob:"

    return text
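
A quick sketch exercising these helpers with made-up input (webm_to_wav is omitted since it needs real webm bytes and ffmpeg on PATH):

from utils import clean_transcript, chunk_text, consolidate_similar_items

raw = "Um, so we basically agreed to, you know, ship on Friday."
print(clean_transcript(raw))  # filler words removed, whitespace collapsed

print(len(chunk_text("word " * 2000)))  # 3 chunks of at most 800 words each

items = ["Ship the release on Friday",
         "We ship the release Friday",  # >60% word overlap with the first
         "Budget review"]
print(consolidate_similar_items(items))  # the near-duplicate is dropped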