devcom33 committed
Commit 6f46074 · 1 Parent(s): 08654c0

Updated FastAPI code
Files changed (6):
  1. app.py +154 -27
  2. config.py +10 -3
  3. models.py +51 -26
  4. requirements.txt +5 -1
  5. services.py +425 -13
  6. utils.py +96 -0
app.py CHANGED
@@ -1,68 +1,161 @@
import logging
- import sys
- from fastapi import FastAPI, UploadFile, File, HTTPException
+ import traceback
+ import os
+ import json
+ import re
+ import time
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+ from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import config
from models import load_whisper, load_summarizer, load_spacy
- from services import process_transcription, process_summary
+ from services import process_transcription, process_summary, create_enhanced_summary_prompt, format_summary_to_markdown, get_language_name
+ from utils import webm_to_wav
+ import google.generativeai as genai
+ from google.api_core import exceptions as api_core_exceptions

logger = logging.getLogger(__name__)

app = FastAPI(
    title="Transcription and Summarization API",
    description="API using Faster-Whisper, spaCy, and Hugging Face Transformers",
-     version="1.0.0"
+     version="1.0.0",
)

+ api_key = os.getenv("GEMINI_API_KEY")
+ if not api_key:
+     logger.critical("GEMINI_API_KEY environment variable not set.")
+ else:
+     genai.configure(api_key=api_key)
+
logger.info("Application starting up - loading models...")
whisper_model = load_whisper(config)
summarizer_pipeline = load_summarizer(config)
nlp_spacy = load_spacy(config)
logger.info("Model loading complete.")

+ origins = ["http://localhost:8080"]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
if not whisper_model:
-     logger.critical("Whisper model failed to load. Transcription endpoint will be unavailable.")
+     logger.critical(
+         "Whisper model failed to load. Transcription endpoint will be unavailable."
+     )
if not summarizer_pipeline:
-     logger.critical("Summarizer pipeline failed to load. Summarization endpoint will be unavailable.")
+     logger.critical(
+         "Summarizer pipeline failed to load. Summarization endpoint will be unavailable."
+     )
if not nlp_spacy:
-     logger.warning("SpaCy model failed to load. Summarization will proceed without spaCy preprocessing.")
+     logger.warning(
+         "SpaCy model failed to load. Summarization will proceed without spaCy preprocessing."
+     )

class TranscriptInput(BaseModel):
    transcript: str
+     language: str = "en"

@app.get("/health")
def health():
-     return {"status": "ok",
+     return {
+         "status": "ok",
        "whisper_loaded": whisper_model is not None,
        "summarizer_loaded": summarizer_pipeline is not None,
-         "spacy_loaded": nlp_spacy is not None
-     }
+         "spacy_loaded": nlp_spacy is not None,
+     }

@app.post("/transcribe")
- async def transcription(audio_file: UploadFile = File(...)):
+ async def transcription(
+     audio_file: UploadFile = File(...),
+     enable_diarization: bool = Form(False)
+ ):
    if whisper_model is None:
        raise HTTPException(status_code=503, detail="Transcription service unavailable.")

    try:
+         start_time = time.time()
+         content_type = audio_file.content_type
        content = await audio_file.read()
-         transcript, info = process_transcription(content, whisper_model)
-         logger.info(f"Transcription successful. Language: {info.language}")
-         return {"transcript": transcript}
+
+         if content_type in ["audio/webm", "video/webm"]:
+             wav_path = webm_to_wav(content)
+             with open(wav_path, "rb") as f:
+                 wav_bytes = f.read()
+             os.remove(wav_path)
+         elif content_type == "audio/wav":
+             wav_bytes = content
+         else:
+             raise HTTPException(status_code=400, detail="Unsupported audio format. Use .webm or .wav")
+
+         transcript, info, diarized_segments = process_transcription(
+             wav_bytes,
+             whisper_model,
+             enable_diarization=enable_diarization
+         )
+
+         processing_time = time.time() - start_time
+         logger.info(f"Transcription successful. Language: {info.language}, Time: {processing_time:.2f}s")
+
+         speakers = []
+         if diarized_segments:
+             for segment in diarized_segments:
+                 if segment["speaker"] not in speakers:
+                     speakers.append(segment["speaker"])
+
+         response = {
+             "transcript": transcript,
+             "language": info.language,
+             "duration": info.duration,
+         }
+
+         if enable_diarization and diarized_segments:
+             response["speakers"] = speakers
+             response["segments"] = diarized_segments
+
+         return response
+     except HTTPException as http_exc:
+         raise http_exc
    except ValueError as ve:
        logger.error(f"Value error during transcription processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
-         logger.error(f"Unhandled error during transcription: {e}", exc_info=True)
+         logger.error(f"Unhandled error during transcription: {e}\n{traceback.format_exc()}")
        raise HTTPException(status_code=500, detail="Internal server error during transcription.")

@app.post("/summarize")
- def summarize(input: TranscriptInput):
+ async def summarize(input: TranscriptInput):
+     if not input.transcript or not input.transcript.strip():
+         raise HTTPException(status_code=400, detail="Transcript cannot be empty.")
+     try:
+         prompt = f"""
+         Summarize the following text concisely:
+         Transcript:
+         \"\"\"
+         {input.transcript}
+         \"\"\"
+         """
+         model = genai.GenerativeModel('gemini-1.5-flash')
+         response = model.generate_content(prompt)
+         logger.info(f"Gemini /summarize response text: '{response.text}'")
+         return {"summary": response.text}
+     except api_core_exceptions.ResourceExhausted as e:
+         logger.error(f"Gemini API rate limit exceeded: {e}")
+         raise HTTPException(status_code=429, detail="API rate limit exceeded. Please wait and try again.")
+     except genai.types.BlockedPromptError as e:
+         logger.error(f"The prompt was blocked: {e}")
+         raise HTTPException(status_code=400, detail="The request was blocked by the content safety filter.")
+     except Exception as e:
+         logger.error(f"An unexpected error occurred during basic summarization: {e}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/smart-summary")
+ def smart_summarize(input: TranscriptInput):
    if summarizer_pipeline is None:
        raise HTTPException(status_code=503, detail="Summarization service unavailable.")
    if not input.transcript:
@@ -71,11 +164,45 @@ def summarize(input: TranscriptInput):
    try:
        summary = process_summary(input.transcript, summarizer_pipeline, nlp_spacy, config)
        return {"summary": summary}
    except ValueError as ve:
        logger.error(f"Value error during summary processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logger.error(f"Unhandled error during summarization: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error during summarization.")
+
+ @app.post("/enhanced-summary")
+ async def enhanced_summary(input: TranscriptInput):
+     if not input.transcript or not input.transcript.strip():
+         raise HTTPException(status_code=400, detail="Transcript cannot be empty.")
+
+     try:
+         language_name = get_language_name(input.language)
+         prompt = create_enhanced_summary_prompt(input.transcript, language_name)
+         model = genai.GenerativeModel('gemini-1.5-flash')
+
+         response = model.generate_content(
+             contents=prompt,
+             generation_config=genai.GenerationConfig(response_mime_type="application/json")
+         )
+
+         try:
+             cleaned_text = re.sub(r"```json\s*(.*)\s*```", r"\1", response.text, flags=re.DOTALL)
+             summary_json = json.loads(cleaned_text)
+             logger.info(f"Received JSON from Gemini: {summary_json}")
+         except (json.JSONDecodeError, TypeError) as e:
+             logger.error(f"Failed to parse LLM response as JSON: {e}\nResponse text: {response.text}")
+             raise HTTPException(status_code=500, detail="Failed to generate a structured summary due to an invalid model response.")
+
+         formatted_markdown = format_summary_to_markdown(summary_json)
+         logger.info(f"Formatted Markdown: {formatted_markdown}")
+         return {"summary": formatted_markdown}
+     except api_core_exceptions.ResourceExhausted as e:
+         logger.error(f"Gemini API rate limit exceeded: {e}")
+         raise HTTPException(status_code=429, detail="API rate limit exceeded. Please wait and try again.")
+     except genai.types.BlockedPromptError as e:
+         logger.error(f"The prompt was blocked: {e}")
+         raise HTTPException(status_code=400, detail="The request was blocked by the content safety filter.")
+     except Exception as e:
+         logger.error(f"An unexpected error occurred during summarization: {e}", exc_info=True)
+         raise HTTPException(status_code=500, detail="An internal server error occurred during summarization.")
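
For reference, a minimal sketch of how a client could exercise the updated endpoints. The endpoint paths, field names, and response keys come from the diff above; the base URL, port, input file name, and the requests dependency are assumptions, not part of this commit.

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host/port, not specified in this commit

    # /health reports which models loaded at startup.
    print(requests.get(f"{BASE_URL}/health").json())

    # /transcribe accepts .webm or .wav uploads; diarization is opt-in via a form field.
    with open("meeting.wav", "rb") as f:  # hypothetical input file
        resp = requests.post(
            f"{BASE_URL}/transcribe",
            files={"audio_file": ("meeting.wav", f, "audio/wav")},
            data={"enable_diarization": "true"},
        )
    result = resp.json()

    # /enhanced-summary turns the transcript into structured Markdown via Gemini.
    summary = requests.post(
        f"{BASE_URL}/enhanced-summary",
        json={"transcript": result["transcript"], "language": result["language"]},
    ).json()
    print(summary["summary"])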
config.py CHANGED
@@ -1,15 +1,22 @@
- import os
import psutil
+ import os
+ from dotenv import load_dotenv
+ import nltk

- WHISPER_MODEL_NAME = "Systran/faster-whisper-tiny"
+ load_dotenv()
+ WHISPER_MODEL_NAME = "tiny"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
PYANNOTE_AUTH_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

+ # SUMMARIZER_MODEL = "google/flan-t5-base"
SUMMARIZER_MODEL = "facebook/bart-large-cnn"
SUMMARIZER_MAX_LENGTH = 150
SUMMARIZER_MIN_LENGTH = 50

SPACY_MODEL = "en_core_web_sm"
CPU_THREADS = max(1, psutil.cpu_count(logical=False))
+
+ if not PYANNOTE_AUTH_TOKEN:
+     raise ValueError("HUGGINGFACE_API_KEY not set in environment variables")
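
Since config.py now calls load_dotenv(), both tokens can live in a local .env file next to the code. A sketch with placeholder values (the variable names come from the diff; the values are not real):

    HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxxxxxxxxxxxxxx
    GEMINI_API_KEY=your-gemini-api-key-here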
models.py CHANGED
@@ -1,61 +1,84 @@
- import os
- os.environ['HF_HOME'] = '/tmp/huggingface'
- os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
-
- os.environ['HUGGINGFACE_HUB_CACHE'] = '/tmp/huggingface'
-
- from huggingface_hub import snapshot_download
- import spacy
import logging
from faster_whisper import WhisperModel
+ import spacy
from transformers import pipeline
+ import os
+ import torch
+ from pyannote.audio import Pipeline

logger = logging.getLogger(__name__)
+ _diarize_model = None

def load_whisper(config):
    logger.info("Loading Whisper model...")
    try:
-         cache_dir = "/tmp/hf-cache"
-         os.makedirs(cache_dir, exist_ok=True)
-
-         model_dir = snapshot_download(
-             repo_id=config.WHISPER_MODEL_NAME,
-             cache_dir=cache_dir,
-             token=os.getenv("HUGGINGFACE_API_KEY")
-         )
-
        model = WhisperModel(
-             model_dir,
+             config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
-             cpu_threads=config.CPU_THREADS
+             cpu_threads=config.CPU_THREADS,
+         )
+         logger.info(
+             f"Whisper model '{config.WHISPER_MODEL_NAME}' loaded on {config.WHISPER_DEVICE}."
        )
-
-         logger.info(f"Whisper model '{config.WHISPER_MODEL_NAME}' loaded from {model_dir} on {config.WHISPER_DEVICE}.")
        return model
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
        return None

+
+ def load_diarization(config):
+     global _diarize_model
+     logger.info("Loading PYANNOTE model...")
+
+     if _diarize_model is None and hasattr(config, "PYANNOTE_AUTH_TOKEN"):
+         try:
+             logger.info("Loading diarization model")
+             _diarize_model = Pipeline.from_pretrained(
+                 "pyannote/speaker-diarization-3.0",
+                 use_auth_token=config.PYANNOTE_AUTH_TOKEN,
+             )
+
+             # Move to GPU if available
+             if (
+                 hasattr(config, "WHISPER_DEVICE")
+                 and config.WHISPER_DEVICE == "cuda"
+                 and torch.cuda.is_available()
+             ):
+                 _diarize_model = _diarize_model.to(torch.device("cuda"))
+
+             logger.info("Diarization model loaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to load diarization model: {e}", exc_info=True)
+
+     return _diarize_model
+
+
def load_summarizer(config):
    logger.info("Loading Summarization pipeline...")
    try:
-         summarizer = pipeline("summarization", model=config.SUMMARIZER_MODEL, from_tf=True)
+         summarizer = pipeline(
+             "text2text-generation",
+             model=config.SUMMARIZER_MODEL,
+             device=0 if torch.cuda.is_available() else -1,
+         )
        logger.info("Summarization pipeline loaded.")
        return summarizer
    except Exception as e:
        logger.error(f"Failed to load Summarization pipeline: {e}", exc_info=True)
        return None

+
def load_spacy(config):
    logger.info("Loading spaCy model...")

    try:
        nlp = spacy.load("en_core_web_sm")
        logger.info("spaCy model 'en_core_web_sm' loaded.")

        return nlp

    except OSError:
@@ -66,9 +89,11 @@ def load_spacy(config):
        nlp = spacy.load("en_core_web_sm")
        logger.info("spaCy model 'en_core_web_sm' downloaded and loaded.")
        return nlp

    except Exception as download_e:
-         logger.error(f"Failed to download or load spaCy model 'en_core_web_sm': {download_e}")
+         logger.error(
+             f"Failed to download or load spaCy model 'en_core_web_sm': {download_e}"
+         )
        return None

    except Exception as e:
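
load_diarization caches the pyannote pipeline in the module-level _diarize_model, so repeated calls are cheap. A quick local sanity check, assuming HUGGINGFACE_API_KEY is set and access to the gated pyannote/speaker-diarization-3.0 repo has been granted on the Hub:

    import logging
    import config
    from models import load_diarization

    logging.basicConfig(level=logging.INFO)

    # The first call downloads and builds the pipeline; the second returns the cached one.
    diarizer = load_diarization(config)
    assert diarizer is load_diarization(config)  # module-level singleton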
requirements.txt CHANGED
@@ -7,4 +7,8 @@ spacy
pydub
psutil
python-multipart
- en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
+ pyannote.audio
+ nltk
+ google-generativeai
+ google-api-core
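
One setup step the requirements file cannot express: services.py falls back to NLTK's punkt sentence tokenizer, which ships as a separate data download. A plausible bootstrap (the punkt download step is an assumption; the commit does not show where the tokenizer data gets installed):

    pip install -r requirements.txt
    python -m nltk.downloader punkt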
services.py CHANGED
@@ -1,10 +1,30 @@
import logging
import os
import tempfile
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ from nltk.tokenize.punkt import PunktSentenceTokenizer
+ from nltk.data import load
+ import pickle
+ import re
+ from utils import clean_transcript, consolidate_similar_items, chunk_text
+ from transformers import pipeline
+ import config
+ from models import load_diarization
+ import wave
+ import gc
+ import torch
+ import time
+ import pycountry
+ from functools import lru_cache

logger = logging.getLogger(__name__)

- def process_transcription(audio_content: bytes, whisper_model):
+ _diarize_model = None
+
+ def process_transcription(audio_content: bytes, whisper_model, enable_diarization=False):
+     start = time.time()
    if not whisper_model:
        raise ValueError("Whisper model not loaded.")

@@ -14,9 +34,123 @@ def process_transcription(audio_content: bytes, whisper_model):
        temp_file_path = temp_file.name
        temp_file.write(audio_content)

-         segments, info = whisper_model.transcribe(temp_file_path, beam_size=5)
+         segments_gen, info = whisper_model.transcribe(temp_file_path, beam_size=5)
+
+         segments = list(segments_gen)
+
        transcript = " ".join([seg.text.strip() for seg in segments])
-         return transcript, info
+
+         global _diarize_model
+
+         if not enable_diarization:
+             return transcript, info, None
+
+         if _diarize_model is None:
+             _diarize_model = load_diarization(config)
+
+         if _diarize_model is None:
+             logger.warning("Diarization model not available, returning transcript without speakers")
+             return transcript, info, None
+
+         with wave.open(temp_file_path, 'rb') as wav:
+             frames = wav.getnframes()
+             rate = wav.getframerate()
+             # calculate the audio duration
+             audio_duration = frames / float(rate)
+
+         if audio_duration < 3.0:
+             logger.info(f"Audio too short ({audio_duration:.2f}s), skipping diarization")
+             diarized_segments = [{"speaker": "SPEAKER_0", "text": transcript}]
+             diarized_transcript = f"[SPEAKER_0]: {transcript}"
+             return diarized_transcript, info, diarized_segments
+
+         logger.info("Running speaker diarization")
+         diarization = _diarize_model(temp_file_path)
+
+         # Extract diarization segments
+         diarize_segments = []
+         for turn, _, speaker in diarization.itertracks(yield_label=True):
+             diarize_segments.append({
+                 "speaker": f"SPEAKER_{speaker.replace('SPEAKER_', '')}",
+                 "start": turn.start,
+                 "end": turn.end
+             })
+
+         diarized_segments = []
+
+         for segment in segments:
+             # Find the best matching speaker based on time overlap
+             best_speaker = None
+             max_overlap = 0
+             seg_start = segment.start
+             seg_end = segment.end
+
+             for diar_seg in diarize_segments:
+                 diar_start = diar_seg["start"]
+                 diar_end = diar_seg["end"]
+                 # Calculate overlap
+                 overlap_start = max(seg_start, diar_start)
+                 overlap_end = min(seg_end, diar_end)
+
+                 if overlap_end > overlap_start:
+                     overlap = overlap_end - overlap_start
+                     if overlap > max_overlap:
+                         max_overlap = overlap
+                         best_speaker = diar_seg["speaker"]
+
+             # If no overlap found, assign to the closest speaker
+             if best_speaker is None:
+                 min_distance = float('inf')
+                 for diar_seg in diarize_segments:
+                     # Distance to start of segment
+                     dist_start = abs(seg_start - diar_seg["start"])
+                     # Distance to end of segment
+                     dist_end = abs(seg_end - diar_seg["end"])
+                     # Take the minimum
+                     dist = min(dist_start, dist_end)
+
+                     if dist < min_distance:
+                         min_distance = dist
+                         best_speaker = diar_seg["speaker"]
+
+             diarized_segments.append({
+                 "speaker": best_speaker or "SPEAKER_UNKNOWN",
+                 "text": segment.text,
+                 "start": segment.start,
+                 "end": segment.end
+             })
+
+         # Format the diarized transcript
+         diarized_transcript = ""
+         current_speaker = None
+
+         for segment in diarized_segments:
+             speaker = segment["speaker"]
+             text = segment["text"].strip()
+
+             if not text:
+                 continue
+
+             if speaker != current_speaker:
+                 diarized_transcript += f"\n[{speaker}]: {text}"
+                 current_speaker = speaker
+             else:
+                 diarized_transcript += f" {text}"
+
+         # Clean up memory
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         end = time.time()
+
+         logger.info(f"time: {(end - start) * 1000:.0f} ms")
+
+         return diarized_transcript, info, diarized_segments
+
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
 
@@ -25,21 +159,299 @@ def process_summary(text: str, summarizer_pipeline, nlp_spacy, config):
    if not summarizer_pipeline:
        raise ValueError("Summarizer model not loaded.")

-     processed_text = text
+     # clean the transcript
+     cleaned_transcript = clean_transcript(text)
+
+     processed_text = cleaned_transcript
+
+     doc = None
+
    if nlp_spacy:
        try:
-             doc = nlp_spacy(text)
+             doc = nlp_spacy(processed_text)
            sentences = [sent.text.strip() for sent in doc.sents]
            processed_text = " ".join(sentences)
        except Exception as e:
            logger.error(f"SpaCy processing failed: {e}", exc_info=True)

-     summary_output = summarizer_pipeline(
-         processed_text,
-         max_length=config.SUMMARIZER_MAX_LENGTH,
-         min_length=config.SUMMARIZER_MIN_LENGTH,
-         do_sample=False
-     )
-
-     final_summary = summary_output[0]['summary_text']
-     return final_summary
+     categories = {
+         "meeting_title": [],
+         "intro": [],
+         "topics": [],
+         "decisions": [],
+         "action_items": [],
+         "questions": [],
+         "deadlines": [],
+         "participants": [],
+         "overall_summary": [],
+         "conclusion": []
+     }
+
+     # extract the meeting title
+     title_pattern = r'(meeting|call|session|discussion) (about|on|for|regarding) ([^.]+)'
+     title_matches = re.findall(title_pattern, processed_text, re.IGNORECASE)
+
+     if title_matches:
+         categories["meeting_title"].append(title_matches[0][2].strip())
+
+     if doc:
+         sentences = [sent.text.strip() for sent in doc.sents]
+     else:
+         try:
+             with open("/home/heymouad/nltk_data/tokenizers/punkt/english.pickle", "rb") as f:
+                 tokenizer = pickle.load(f)
+             sentences = tokenizer.tokenize(processed_text)
+         except Exception as e:
+             logger.error(f"NLTK tokenization failed: {e}", exc_info=True)
+             sentences = sent_tokenize(processed_text)
+
+     # Find participants
+     people = set()
+     if doc:
+         for ent in doc.ents:
+             if ent.label_ == "PERSON":
+                 person = ent.text.strip()
+                 if len(person) > 2:
+                     people.add(person)
+
+     if people:
+         categories["participants"] = list(people)
+
+     try:
+         # chunk the text because of the BART model's input-length limit
+         logger.info(processed_text[:100])
+         processed_text = chunk_text(processed_text)
+         parts_summaries = []
+
+         for chunk in processed_text:
+             result = summarizer_pipeline(chunk, max_length=150, min_length=30, do_sample=False)
+             if result and isinstance(result, list) and len(result) > 0:
+                 part_summary = result[0].get('summary_text', '')
+                 if part_summary:
+                     parts_summaries.append(part_summary)
+
+         overall_summary = " ".join(parts_summaries)
+         overall_summary = summarizer_pipeline(overall_summary, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
+         categories["overall_summary"] = [overall_summary]
+
+     except Exception as e:
+         logger.error(f"Summarization failed: {e}", exc_info=True)
+         categories["overall_summary"] = ["Failed to generate overall summary."]
+
+     # Process each sentence
+     for i, sentence in enumerate(sentences):
+         sentence = sentence.strip()
+         if not sentence:
+             continue
+
+         # Check for action items
+         if (re.search(r'(need to|will|shall|must|should|have to|assigned to|responsible for|task|action item|to-do|follow up|take care of)',
+                       sentence, re.IGNORECASE) and
+                 re.search(r'(we|you|I|they|he|she|team|group|department)', sentence, re.IGNORECASE)):
+             categories["action_items"].append(sentence)
+             continue
+
+         # Check for decisions
+         if re.search(r'(decided|agreed|conclusion|resolved|approved|rejected|consensus|finalized|confirmed|determined)',
+                      sentence, re.IGNORECASE):
+             categories["decisions"].append(sentence)
+             continue
+
+         # Check for deadlines/timing with stronger patterns
+         if re.search(r'(by|due|deadline|schedule|date|tomorrow|next week|month|calendar|remind|upcoming|on|at|until)',
+                      sentence, re.IGNORECASE) and re.search(r'(time|day|week|month|year|hour|minute)', sentence, re.IGNORECASE):
+             categories["deadlines"].append(sentence)
+             continue
+
+         # Check for questions/issues
+         if (re.search(r'(\?|issue|problem|concern|question|clarif|wonder|how|what|when|where|why|who)',
+                       sentence, re.IGNORECASE) and
+                 not re.search(r'(answer|answered|resolved|solved)', sentence, re.IGNORECASE)):
+             categories["questions"].append(sentence)
+             continue
+
+         # Check for intro statements
+         if i < len(sentences) // 10:  # First 10% of sentences
+             if re.search(r'(welcome|begin|start|agenda|today|discuss|meeting|introduce|opening|good morning|hello|topic)',
+                          sentence, re.IGNORECASE):
+                 categories["intro"].append(sentence)
+                 continue
+
+         # Check for conclusion statements
+         if i > len(sentences) * 9 // 10:  # Last 10% of sentences
+             if re.search(r'(conclude|end|wrap|summary|thank|next meeting|follow up|adjourn|goodbye|bye|closing)',
+                          sentence, re.IGNORECASE):
+                 categories["conclusion"].append(sentence)
+                 continue
+
+         # Everything else is considered a topic if it has substance
+         if len(sentence.split()) > 3:  # Avoid very short sentences
+             categories["topics"].append(sentence)
+
+     # Process categories to avoid repetition and consolidate related points
+     for category in categories:
+         if category in ["topics", "action_items", "decisions", "questions", "deadlines"]:
+             categories[category] = consolidate_similar_items(categories[category])
+
+     # Limit the number of topics to avoid overwhelming the reader
+     if len(categories["topics"]) > 10:
+         # If we have a summarizer, try to generate a summary of the topics
+         try:
+             topics_text = " ".join(categories["topics"])
+             topics_summary = summarizer_pipeline(topics_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+             categories["topics"] = sent_tokenize(topics_summary)
+         except Exception as e:
+             logger.error(f"Topics summarization failed: {e}", exc_info=True)
+             # Otherwise just take the first few and last few topics
+             categories["topics"] = categories["topics"][:5] + categories["topics"][-5:]
+
+     # Add emojis to the formatted output
+     formatted_summary = []
+
+     # Format the meeting title if available
+     if categories.get("meeting_title"):
+         formatted_summary.append(f"📝 **Meeting Title:** {categories['meeting_title'][0]}")
+         formatted_summary.append("")
+
+     # Add the overall summary
+     if categories.get("overall_summary"):
+         formatted_summary.append("📋 **Executive Summary:**")
+         formatted_summary.append(categories["overall_summary"][0])
+         formatted_summary.append("")
+
+     # Format participants
+     if categories["participants"]:
+         formatted_summary.append("👥 **Participants:**")
+         formatted_summary.append(", ".join(categories["participants"]))
+         formatted_summary.append("")
+
+     # Format the intro
+     if categories["intro"]:
+         formatted_summary.append("🎯 **Meeting Introduction:**")
+         formatted_summary.append(" ".join(categories["intro"]))
+         formatted_summary.append("")
+
+     # Format the main topics
+     if categories["topics"]:
+         formatted_summary.append("💡 **Key Topics:**")
+         for i, topic in enumerate(categories["topics"], 1):
+             formatted_summary.append(f"{i}. {topic}")
+         formatted_summary.append("")
+
+     # Format decisions
+     if categories["decisions"]:
+         formatted_summary.append("✅ **Decisions Made:**")
+         for decision in categories["decisions"]:
+             formatted_summary.append(f"• {decision}")
+         formatted_summary.append("")
+
+     # Format action items
+     if categories["action_items"]:
+         formatted_summary.append("📋 **Action Items:**")
+         for item in categories["action_items"]:
+             formatted_summary.append(f"• {item}")
+         formatted_summary.append("")
+
+     # Format questions
+     if categories["questions"]:
+         formatted_summary.append("❓ **Questions & Concerns:**")
+         for question in categories["questions"]:
+             formatted_summary.append(f"• {question}")
+         formatted_summary.append("")
+
+     # Format deadlines
+     if categories["deadlines"]:
+         formatted_summary.append("⏰ **Deadlines & Timing:**")
+         for deadline in categories["deadlines"]:
+             formatted_summary.append(f"• {deadline}")
+         formatted_summary.append("")
+
+     # Format the conclusion
+     if categories["conclusion"]:
+         formatted_summary.append("🏁 **Conclusion:**")
+         formatted_summary.append(" ".join(categories["conclusion"]))
+
+     return "\n".join(formatted_summary)
+
+
+ def create_enhanced_summary_prompt(transcript: str, language_name: str) -> str:
+     """
+     Creates a single, dynamic and insistent prompt that instructs the AI
+     to output its findings in the specified language.
+     """
+     return f"""
+     You are an expert AI assistant. Your task is to analyze the following meeting transcript and extract key information into a structured JSON object.
+
+     **Primary Goal:** Analyze the provided transcript and generate a structured summary.
+
+     **CRITICAL LANGUAGE INSTRUCTION:** All text in your final JSON response must be written in the following language: **{language_name}**. There are no exceptions.
+
+     **ANALYSIS INSTRUCTIONS:**
+     1. Read the entire transcript to understand its context.
+     2. Identify a concise title for the meeting.
+     3. Identify all participants mentioned.
+     4. Write a brief paragraph summarizing the core themes and outcomes.
+     5. List all clear and agreed-upon decisions.
+     6. Extract all clear action items, identifying the task, who it was assigned to, the due date if mentioned, and the context.
+
+     **OUTPUT INSTRUCTIONS:**
+     - Respond ONLY with a valid JSON object.
+     - The JSON must use these exact keys: "meeting_title", "participants", "meeting_summary", "decisions_made", "action_items".
+     - **Language Check:** Before you finalize your response, verify that every single string value within the JSON is written in **{language_name}**.
+
+     **TRANSCRIPT TO ANALYZE:**
+     \"\"\"
+     {transcript}
+     \"\"\"
+     """
+
+
+ @lru_cache(maxsize=None)
+ def get_language_name(language_code: str) -> str:
+     """Converts a two-letter language code (e.g., 'es') to its full name (e.g., 'Spanish')."""
+     try:
+         lang = pycountry.languages.get(alpha_2=language_code)
+         return lang.name if lang else language_code
+     except Exception:
+         return language_code
+
+
+ def format_summary_to_markdown(summary_json: dict) -> str:
+     """Converts the structured JSON summary into a formatted Markdown string."""
+
+     summary_data = {k.lower().replace(" ", "_"): v for k, v in summary_json.items()}
+
+     if not summary_data.get("meeting_summary") and not summary_data.get("decisions_made") and not summary_data.get("action_items"):
+         return "The provided transcript was too short or lacked sufficient content to generate a detailed summary."
+
+     markdown_parts = []
+
+     if title := summary_data.get("meeting_title"):
+         markdown_parts.append(f"### {title}\n")
+
+     if summary := summary_data.get("meeting_summary"):
+         markdown_parts.append("📝 **Meeting Summary:**")
+         markdown_parts.append(summary)
+         markdown_parts.append("")
+
+     if decisions := summary_data.get("decisions_made"):
+         markdown_parts.append("📌 **Decisions Made:**")
+         for decision in decisions:
+             markdown_parts.append(f"- {decision}")
+         markdown_parts.append("")
+
+     if action_items := summary_data.get("action_items"):
+         markdown_parts.append("✅ **Action Items:**")
+         for item in action_items:
+             task = item.get('task', item.get('Task', 'N/A'))
+             assigned_to = item.get('assigned_to', item.get('Assigned To', 'Not specified'))
+             due_date = item.get('due_date', item.get('Due Date', 'Not specified'))
+             context = item.get('context', item.get('Context', ''))
+
+             markdown_parts.append(f"- **Task**: {task}")
+             markdown_parts.append(f"  - **Assigned To**: {assigned_to}")
+             markdown_parts.append(f"  - **Due Date**: {due_date}")
+             if context:
+                 markdown_parts.append(f"  - **Context**: {context}")
+             markdown_parts.append("")
+
+     return "\n".join(markdown_parts)
 
utils.py CHANGED
@@ -0,0 +1,96 @@
+ import re
+ import subprocess
+ import tempfile
+ import os
+
+
+ def clean_transcript(text):
+     """Clean the transcript by removing filler words and collapsing whitespace."""
+
+     filler_words = [
+         r'\bum\b', r'\buh\b', r'\blike\b', r'\byou know\b', r'\bkind of\b',
+         r'\bsort of\b', r'\bI mean\b', r'\bbasically\b', r'\bactually\b',
+         r'\bso\b', r'\banyway\b', r'\bjust\b'
+     ]
+
+     for word in filler_words:
+         text = re.sub(word, "", text, flags=re.IGNORECASE)
+
+     text = re.sub(r'\s+', ' ', text)
+
+     return text.strip()
+
+
+ def consolidate_similar_items(items):
+     """Consolidate similar items to reduce repetition."""
+     if not items or len(items) <= 1:
+         return items
+
+     # Simple similarity measure based on word overlap
+     result = [items[0]]
+     for item in items[1:]:
+         # Convert to sets of words for comparison
+         item_words = set(item.lower().split())
+
+         # Check if this item is too similar to any existing item
+         too_similar = False
+         for existing_item in result:
+             existing_words = set(existing_item.lower().split())
+             # Calculate Jaccard similarity
+             intersection = len(item_words.intersection(existing_words))
+             union = len(item_words.union(existing_words))
+             if union > 0 and intersection / union > 0.6:  # 60% similarity threshold
+                 too_similar = True
+                 break
+
+         if not too_similar:
+             result.append(item)
+
+     return result
+
+
+ def chunk_text(text, max_tokens=800):
+     words = text.split()
+     chunks = []
+     if len(words) > max_tokens:
+         for i in range(0, len(words), max_tokens):
+             chunk = " ".join(words[i:i + max_tokens])
+             chunks.append(chunk)
+         return chunks
+     else:
+         return [text]
+
+
+ def webm_to_wav(webm_bytes: bytes) -> str:
+     """
+     Converts webm audio bytes to a wav file and returns the path to the wav file.
+     """
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as webm_file:
+         webm_path = webm_file.name
+         webm_file.write(webm_bytes)
+
+     wav_path = webm_path.replace(".webm", ".wav")
+
+     try:
+         subprocess.run([
+             "ffmpeg", "-y", "-i", webm_path, "-ar", "16000", "-ac", "1", wav_path
+         ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     finally:
+         os.remove(webm_path)
+
+     return wav_path
+
+
+ def preprocess_transcript(transcript: str) -> str:
+     """
+     Cleans and normalizes the transcript.
+     - Removes extra whitespace.
+     - Can be expanded to handle speaker diarization, e.g., "Speaker A:" -> "Alice:"
+     """
+     # Simple cleaning
+     text = re.sub(r'\s+', ' ', transcript).strip()
+
+     # Advanced: a future step could be to normalize speaker names,
+     # e.g., mapping "Bob's update" to "Bob:"
+
+     return text
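
And a small demonstration of the chunking helper, which keeps each piece under BART's input window; the word counts below follow directly from the 800-word default. Note that webm_to_wav additionally requires an ffmpeg binary on PATH, which requirements.txt cannot declare.

    from utils import chunk_text

    transcript = " ".join(["word"] * 2000)  # synthetic 2000-word transcript
    chunks = chunk_text(transcript, max_tokens=800)
    print([len(c.split()) for c in chunks])  # [800, 800, 400]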