devcom33 committed
Commit 6f46074 · 1 Parent(s): 08654c0

Updated FastAPI code
Files changed (6):
  1. app.py +154 -27
  2. config.py +10 -3
  3. models.py +51 -26
  4. requirements.txt +5 -1
  5. services.py +425 -13
  6. utils.py +96 -0
app.py CHANGED
@@ -1,68 +1,161 @@
import logging
- import sys
- from fastapi import FastAPI, UploadFile, File, HTTPException
+ import traceback
+ import os
+ import json
+ import re
+ import time
+ from fastapi import FastAPI, UploadFile, File, HTTPException, Form
+ from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
import config
from models import load_whisper, load_summarizer, load_spacy
- from services import process_transcription, process_summary
+ from services import process_transcription, process_summary, create_enhanced_summary_prompt, format_summary_to_markdown, get_language_name
+ from utils import webm_to_wav
+ import google.generativeai as genai
+ from google.api_core import exceptions as api_core_exceptions

logger = logging.getLogger(__name__)

app = FastAPI(
    title="Transcription and Summarization API",
    description="API using Faster-Whisper, spaCy, and Hugging Face Transformers",
-     version="1.0.0"
+     version="1.0.0",
)

+ api_key = os.getenv("GEMINI_API_KEY")
+ if not api_key:
+     logger.critical("GEMINI_API_KEY environment variable not set.")
+ else:
+     genai.configure(api_key=api_key)
+
logger.info("Application starting up - loading models...")
whisper_model = load_whisper(config)
summarizer_pipeline = load_summarizer(config)
nlp_spacy = load_spacy(config)
logger.info("Model loading complete.")

+ origins = ["http://localhost:8080"]
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=origins,
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
if not whisper_model:
-     logger.critical("Whisper model failed to load. Transcription endpoint will be unavailable.")
+     logger.critical(
+         "Whisper model failed to load. Transcription endpoint will be unavailable."
+     )
if not summarizer_pipeline:
-     logger.critical("Summarizer pipeline failed to load. Summarization endpoint will be unavailable.")
+     logger.critical(
+         "Summarizer pipeline failed to load. Summarization endpoint will be unavailable."
+     )
if not nlp_spacy:
-     logger.warning("SpaCy model failed to load. Summarization will proceed without spaCy preprocessing.")
+     logger.warning(
+         "SpaCy model failed to load. Summarization will proceed without spaCy preprocessing."
+     )

class TranscriptInput(BaseModel):
    transcript: str
+     language: str = "en"

@app.get("/health")
def health():
-     return {"status": "ok",
+     return {
+         "status": "ok",
        "whisper_loaded": whisper_model is not None,
        "summarizer_loaded": summarizer_pipeline is not None,
-         "spacy_loaded": nlp_spacy is not None
-     }
+         "spacy_loaded": nlp_spacy is not None,
+     }

@app.post("/transcribe")
- async def transcription(audio_file: UploadFile = File(...)):
+ async def transcription(
+     audio_file: UploadFile = File(...),
+     enable_diarization: bool = Form(False)
+ ):
    if whisper_model is None:
        raise HTTPException(status_code=503, detail="Transcription service unavailable.")

    try:
+         start_time = time.time()
+         content_type = audio_file.content_type
        content = await audio_file.read()
-         transcript, info = process_transcription(content, whisper_model)
-         logger.info(f"Transcription successful. Language: {info.language}")
-         return {"transcript": transcript}
+
+         if content_type in ["audio/webm", "video/webm"]:
+             wav_path = webm_to_wav(content)
+             with open(wav_path, "rb") as f:
+                 wav_bytes = f.read()
+             os.remove(wav_path)
+         elif content_type == "audio/wav":
+             wav_bytes = content
+         else:
+             raise HTTPException(status_code=400, detail="Unsupported audio format. Use .webm or .wav")
+
+         transcript, info, diarized_segments = process_transcription(
+             wav_bytes,
+             whisper_model,
+             enable_diarization=enable_diarization
+         )
+
+         processing_time = time.time() - start_time
+         logger.info(f"Transcription successful. Language: {info.language}, Time: {processing_time:.2f}s")
+
+         speakers = []
+         if diarized_segments:
+             for segment in diarized_segments:
+                 if segment["speaker"] not in speakers:
+                     speakers.append(segment["speaker"])
+
+         response = {
+             "transcript": transcript,
+             "language": info.language,
+             "duration": info.duration,
+         }
+
+         if enable_diarization and diarized_segments:
+             response["speakers"] = speakers
+             response["segments"] = diarized_segments
+
+         return response
+     except HTTPException as http_exc:
+         raise http_exc
    except ValueError as ve:
        logger.error(f"Value error during transcription processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
-         logger.error(f"Unhandled error during transcription: {e}", exc_info=True)
+         logger.error(f"Unhandled error during transcription: {e}\n{traceback.format_exc()}")
        raise HTTPException(status_code=500, detail="Internal server error during transcription.")

@app.post("/summarize")
- def summarize(input: TranscriptInput):
+ async def summarize(input: TranscriptInput):
+     if not input.transcript or not input.transcript.strip():
+         raise HTTPException(status_code=400, detail="Transcript cannot be empty.")
+     try:
+         prompt = f"""
+         Summarize the following text concisely:
+         Transcript:
+         \"\"\"
+         {input.transcript}
+         \"\"\"
+         """
+         model = genai.GenerativeModel('gemini-1.5-flash')
+         response = model.generate_content(prompt)
+         logger.info(f"Gemini /summarize response text: '{response.text}'")
+         return {"summary": response.text}
+     except api_core_exceptions.ResourceExhausted as e:
+         logger.error(f"Gemini API rate limit exceeded: {e}")
+         raise HTTPException(status_code=429, detail="API rate limit exceeded. Please wait and try again.")
+     except genai.types.BlockedPromptError as e:
+         logger.error(f"The prompt was blocked: {e}")
+         raise HTTPException(status_code=400, detail="The request was blocked by the content safety filter.")
+     except Exception as e:
+         logger.error(f"An unexpected error occurred during basic summarization: {e}", exc_info=True)
+         raise HTTPException(status_code=500, detail=str(e))
+
+ @app.post("/smart-summary")
+ def smart_summarize(input: TranscriptInput):
    if summarizer_pipeline is None:
        raise HTTPException(status_code=503, detail="Summarization service unavailable.")
    if not input.transcript:
@@ -71,11 +164,45 @@ def summarize(input: TranscriptInput):
    try:
        summary = process_summary(input.transcript, summarizer_pipeline, nlp_spacy, config)
        return {"summary": summary}
    except ValueError as ve:
        logger.error(f"Value error during summary processing: {ve}")
        raise HTTPException(status_code=400, detail=str(ve))
    except Exception as e:
        logger.error(f"Unhandled error during summarization: {e}", exc_info=True)
        raise HTTPException(status_code=500, detail="Internal server error during summarization.")
+
+ @app.post("/enhanced-summary")
+ async def enhanced_summary(input: TranscriptInput):
+     if not input.transcript or not input.transcript.strip():
+         raise HTTPException(status_code=400, detail="Transcript cannot be empty.")
+
+     try:
+         language_name = get_language_name(input.language)
+         prompt = create_enhanced_summary_prompt(input.transcript, language_name)
+         model = genai.GenerativeModel('gemini-1.5-flash')
+
+         response = model.generate_content(
+             contents=prompt,
+             generation_config=genai.GenerationConfig(response_mime_type="application/json")
+         )
+
+         try:
+             cleaned_text = re.sub(r"```json\s*(.*)\s*```", r"\1", response.text, flags=re.DOTALL)
+             summary_json = json.loads(cleaned_text)
+             logger.info(f"Received JSON from Gemini: {summary_json}")
+         except (json.JSONDecodeError, TypeError) as e:
+             logger.error(f"Failed to parse LLM response as JSON: {e}\nResponse text: {response.text}")
+             raise HTTPException(status_code=500, detail="Failed to generate a structured summary due to an invalid model response.")
+
+         formatted_markdown = format_summary_to_markdown(summary_json)
+         logger.info(f"Formatted Markdown: {formatted_markdown}")
+         return {"summary": formatted_markdown}
+     except api_core_exceptions.ResourceExhausted as e:
+         logger.error(f"Gemini API rate limit exceeded: {e}")
+         raise HTTPException(status_code=429, detail="API rate limit exceeded. Please wait and try again.")
+     except genai.types.BlockedPromptError as e:
+         logger.error(f"The prompt was blocked: {e}")
+         raise HTTPException(status_code=400, detail="The request was blocked by the content safety filter.")
+     except Exception as e:
+         logger.error(f"An unexpected error occurred during summarization: {e}", exc_info=True)
+         raise HTTPException(status_code=500, detail="An internal server error occurred during summarization.")
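
For reference, a minimal sketch of how a client could exercise the updated endpoints. The endpoint paths, field names, and response keys come from the diff above; the base URL, port, input file name, and the requests dependency are assumptions, not part of this commit.

    import requests

    BASE_URL = "http://localhost:8000"  # assumed host/port, not specified in this commit

    # /health reports which models loaded at startup.
    print(requests.get(f"{BASE_URL}/health").json())

    # /transcribe accepts .webm or .wav uploads; diarization is opt-in via a form field.
    with open("meeting.wav", "rb") as f:  # hypothetical input file
        resp = requests.post(
            f"{BASE_URL}/transcribe",
            files={"audio_file": ("meeting.wav", f, "audio/wav")},
            data={"enable_diarization": "true"},
        )
    result = resp.json()

    # /enhanced-summary turns the transcript into structured Markdown via Gemini.
    summary = requests.post(
        f"{BASE_URL}/enhanced-summary",
        json={"transcript": result["transcript"], "language": result["language"]},
    ).json()
    print(summary["summary"])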
config.py CHANGED
@@ -1,15 +1,22 @@
- import os
import psutil
+ import os
+ from dotenv import load_dotenv
+ import nltk

- WHISPER_MODEL_NAME = "Systran/faster-whisper-tiny"
+ load_dotenv()
+ WHISPER_MODEL_NAME = "tiny"
WHISPER_DEVICE = "cpu"
WHISPER_COMPUTE_TYPE = "int8"
PYANNOTE_AUTH_TOKEN = os.getenv("HUGGINGFACE_API_KEY")

+ # SUMMARIZER_MODEL = "google/flan-t5-base"
SUMMARIZER_MODEL = "facebook/bart-large-cnn"
SUMMARIZER_MAX_LENGTH = 150
SUMMARIZER_MIN_LENGTH = 50

SPACY_MODEL = "en_core_web_sm"
CPU_THREADS = max(1, psutil.cpu_count(logical=False))
+
+ if not PYANNOTE_AUTH_TOKEN:
+     raise ValueError("HUGGINGFACE_API_KEY not set in environment variables")
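
Since config.py now calls load_dotenv(), both tokens can live in a local .env file next to the code. A sketch with placeholder values (the variable names come from the diff; the values are not real):

    HUGGINGFACE_API_KEY=hf_xxxxxxxxxxxxxxxxxxxxxxxx
    GEMINI_API_KEY=your-gemini-api-key-here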
models.py CHANGED
@@ -1,61 +1,84 @@
- import os
- os.environ['HF_HOME'] = '/tmp/huggingface'
- os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
-
- os.environ['HUGGINGFACE_HUB_CACHE'] = '/tmp/huggingface'
-
- from huggingface_hub import snapshot_download
- import spacy
import logging
from faster_whisper import WhisperModel
+ import spacy
from transformers import pipeline
+ import os
+ import torch
+ from pyannote.audio import Pipeline

logger = logging.getLogger(__name__)
+ _diarize_model = None

def load_whisper(config):
    logger.info("Loading Whisper model...")
    try:
-         cache_dir = "/tmp/hf-cache"
-         os.makedirs(cache_dir, exist_ok=True)
-
-         model_dir = snapshot_download(
-             repo_id=config.WHISPER_MODEL_NAME,
-             cache_dir=cache_dir,
-             token=os.getenv("HUGGINGFACE_API_KEY")
-         )
-
        model = WhisperModel(
-             model_dir,
+             config.WHISPER_MODEL_NAME,
            device=config.WHISPER_DEVICE,
            compute_type=config.WHISPER_COMPUTE_TYPE,
-             cpu_threads=config.CPU_THREADS
+             cpu_threads=config.CPU_THREADS,
+         )
+         logger.info(
+             f"Whisper model '{config.WHISPER_MODEL_NAME}' loaded on {config.WHISPER_DEVICE}."
        )
-
-         logger.info(f"Whisper model '{config.WHISPER_MODEL_NAME}' loaded from {model_dir} on {config.WHISPER_DEVICE}.")
        return model
    except Exception as e:
        logger.error(f"Failed to load Whisper model: {e}", exc_info=True)
        return None

+
+ def load_diarization(config):
+     global _diarize_model
+     logger.info("Loading PYANNOTE model...")
+
+     if _diarize_model is None and hasattr(config, "PYANNOTE_AUTH_TOKEN"):
+         try:
+             logger.info("Loading diarization model")
+             _diarize_model = Pipeline.from_pretrained(
+                 "pyannote/speaker-diarization-3.0",
+                 use_auth_token=config.PYANNOTE_AUTH_TOKEN,
+             )
+
+             # Move to GPU if available
+             if (
+                 hasattr(config, "WHISPER_DEVICE")
+                 and config.WHISPER_DEVICE == "cuda"
+                 and torch.cuda.is_available()
+             ):
+                 _diarize_model = _diarize_model.to(torch.device("cuda"))
+
+             logger.info("Diarization model loaded successfully")
+         except Exception as e:
+             logger.error(f"Failed to load diarization model: {e}", exc_info=True)
+
+     return _diarize_model
+
+
def load_summarizer(config):
    logger.info("Loading Summarization pipeline...")
    try:
-         summarizer = pipeline("summarization", model=config.SUMMARIZER_MODEL, from_tf=True)
+         summarizer = pipeline(
+             "text2text-generation",
+             model=config.SUMMARIZER_MODEL,
+             device=0 if torch.cuda.is_available() else -1,
+         )
        logger.info("Summarization pipeline loaded.")
        return summarizer
    except Exception as e:
        logger.error(f"Failed to load Summarization pipeline: {e}", exc_info=True)
        return None

+
def load_spacy(config):
    logger.info("Loading spaCy model...")

    try:
        nlp = spacy.load("en_core_web_sm")
        logger.info("spaCy model 'en_core_web_sm' loaded.")

        return nlp

    except OSError:
@@ -66,9 +89,11 @@ def load_spacy(config):
        nlp = spacy.load("en_core_web_sm")
        logger.info("spaCy model 'en_core_web_sm' downloaded and loaded.")
        return nlp

    except Exception as download_e:
-         logger.error(f"Failed to download or load spaCy model 'en_core_web_sm': {download_e}")
+         logger.error(
+             f"Failed to download or load spaCy model 'en_core_web_sm': {download_e}"
+         )
        return None

    except Exception as e:
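
load_diarization caches the pyannote pipeline in the module-level _diarize_model, so repeated calls are cheap. A quick local sanity check, assuming HUGGINGFACE_API_KEY is set and access to the gated pyannote/speaker-diarization-3.0 repo has been granted on the Hub:

    import logging
    import config
    from models import load_diarization

    logging.basicConfig(level=logging.INFO)

    # The first call downloads and builds the pipeline; the second returns the cached one.
    diarizer = load_diarization(config)
    assert diarizer is load_diarization(config)  # module-level singleton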
requirements.txt CHANGED
@@ -7,4 +7,8 @@ spacy
pydub
psutil
python-multipart
- en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
+ en-core-web-sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl
+ pyannote.audio
+ nltk
+ google-generativeai
+ google-api-core
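
One setup step the requirements file cannot express: services.py falls back to NLTK's punkt sentence tokenizer, which ships as a separate data download. A plausible bootstrap (the punkt download step is an assumption; the commit does not show where the tokenizer data gets installed):

    pip install -r requirements.txt
    python -m nltk.downloader punkt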
services.py CHANGED
@@ -1,10 +1,30 @@
import logging
import os
import tempfile
+ import nltk
+ from nltk.tokenize import sent_tokenize
+ from nltk.tokenize.punkt import PunktSentenceTokenizer
+ from nltk.data import load
+ import pickle
+ import re
+ from utils import clean_transcript, consolidate_similar_items, chunk_text
+ from transformers import pipeline
+ import config
+ from models import load_diarization
+ import wave
+ import gc
+ import torch
+ import time
+ import pycountry
+ from functools import lru_cache

logger = logging.getLogger(__name__)

- def process_transcription(audio_content: bytes, whisper_model):
+ _diarize_model = None
+
+ def process_transcription(audio_content: bytes, whisper_model, enable_diarization=False):
+     start = time.time()
    if not whisper_model:
        raise ValueError("Whisper model not loaded.")

@@ -14,9 +34,123 @@ def process_transcription(audio_content: bytes, whisper_model):
        temp_file_path = temp_file.name
        temp_file.write(audio_content)

-         segments, info = whisper_model.transcribe(temp_file_path, beam_size=5)
+         segments_gen, info = whisper_model.transcribe(temp_file_path, beam_size=5)
+
+         segments = list(segments_gen)
+
        transcript = " ".join([seg.text.strip() for seg in segments])
-         return transcript, info
+
+         global _diarize_model
+
+         if not enable_diarization:
+             return transcript, info, None
+
+         if _diarize_model is None:
+             _diarize_model = load_diarization(config)
+
+         if _diarize_model is None:
+             logger.warning("Diarization model not available, returning transcript without speakers")
+             return transcript, info, None
+
+         with wave.open(temp_file_path, 'rb') as wav:
+             frames = wav.getnframes()
+             rate = wav.getframerate()
+             # calculate the audio duration
+             audio_duration = frames / float(rate)
+
+         if audio_duration < 3.0:
+             logger.info(f"Audio too short ({audio_duration:.2f}s), skipping diarization")
+             diarized_segments = [{"speaker": "SPEAKER_0", "text": transcript}]
+             diarized_transcript = f"[SPEAKER_0]: {transcript}"
+             return diarized_transcript, info, diarized_segments
+
+         logger.info("Running speaker diarization")
+         diarization = _diarize_model(temp_file_path)
+
+         # Extract diarization segments
+         diarize_segments = []
+         for turn, _, speaker in diarization.itertracks(yield_label=True):
+             diarize_segments.append({
+                 "speaker": f"SPEAKER_{speaker.replace('SPEAKER_', '')}",
+                 "start": turn.start,
+                 "end": turn.end
+             })
+
+         diarized_segments = []
+
+         for segment in segments:
+             # Find the best matching speaker based on time overlap
+             best_speaker = None
+             max_overlap = 0
+             seg_start = segment.start
+             seg_end = segment.end
+
+             for diar_seg in diarize_segments:
+                 diar_start = diar_seg["start"]
+                 diar_end = diar_seg["end"]
+                 # Calculate overlap
+                 overlap_start = max(seg_start, diar_start)
+                 overlap_end = min(seg_end, diar_end)
+
+                 if overlap_end > overlap_start:
+                     overlap = overlap_end - overlap_start
+                     if overlap > max_overlap:
+                         max_overlap = overlap
+                         best_speaker = diar_seg["speaker"]
+
+             # If no overlap found, assign to the closest speaker
+             if best_speaker is None:
+                 min_distance = float('inf')
+                 for diar_seg in diarize_segments:
+                     # Distance to start of segment
+                     dist_start = abs(seg_start - diar_seg["start"])
+                     # Distance to end of segment
+                     dist_end = abs(seg_end - diar_seg["end"])
+                     # Take the minimum
+                     dist = min(dist_start, dist_end)
+
+                     if dist < min_distance:
+                         min_distance = dist
+                         best_speaker = diar_seg["speaker"]
+
+             diarized_segments.append({
+                 "speaker": best_speaker or "SPEAKER_UNKNOWN",
+                 "text": segment.text,
+                 "start": segment.start,
+                 "end": segment.end
+             })
+
+         # Format the diarized transcript
+         diarized_transcript = ""
+         current_speaker = None
+
+         for segment in diarized_segments:
+             speaker = segment["speaker"]
+             text = segment["text"].strip()
+
+             if not text:
+                 continue
+
+             if speaker != current_speaker:
+                 diarized_transcript += f"\n[{speaker}]: {text}"
+                 current_speaker = speaker
+             else:
+                 diarized_transcript += f" {text}"
+
+         # Clean up memory
+         gc.collect()
+         if torch.cuda.is_available():
+             torch.cuda.empty_cache()
+
+         end = time.time()
+
+         logger.info(f"time: {(end - start) * 1000:.0f} ms")
+
+         return diarized_transcript, info, diarized_segments
+
    finally:
        if temp_file_path and os.path.exists(temp_file_path):
            os.remove(temp_file_path)
 
@@ -25,21 +159,299 @@ def process_summary(text: str, summarizer_pipeline, nlp_spacy, config):
    if not summarizer_pipeline:
        raise ValueError("Summarizer model not loaded.")

-     processed_text = text
+     # clean the transcript
+     cleaned_transcript = clean_transcript(text)
+
+     processed_text = cleaned_transcript
+
+     doc = None
+
    if nlp_spacy:
        try:
-             doc = nlp_spacy(text)
+             doc = nlp_spacy(processed_text)
            sentences = [sent.text.strip() for sent in doc.sents]
            processed_text = " ".join(sentences)
        except Exception as e:
            logger.error(f"SpaCy processing failed: {e}", exc_info=True)

-     summary_output = summarizer_pipeline(
-         processed_text,
-         max_length=config.SUMMARIZER_MAX_LENGTH,
-         min_length=config.SUMMARIZER_MIN_LENGTH,
-         do_sample=False
-     )
-
-     final_summary = summary_output[0]['summary_text']
-     return final_summary
+     categories = {
+         "meeting_title": [],
+         "intro": [],
+         "topics": [],
+         "decisions": [],
+         "action_items": [],
+         "questions": [],
+         "deadlines": [],
+         "participants": [],
+         "overall_summary": [],
+         "conclusion": []
+     }
+
+     # extract the meeting title
+     title_pattern = r'(meeting|call|session|discussion) (about|on|for|regarding) ([^.]+)'
+     title_matches = re.findall(title_pattern, processed_text, re.IGNORECASE)
+
+     if title_matches:
+         categories["meeting_title"].append(title_matches[0][2].strip())
+
+     if doc:
+         sentences = [sent.text.strip() for sent in doc.sents]
+     else:
+         try:
+             with open("/home/heymouad/nltk_data/tokenizers/punkt/english.pickle", "rb") as f:
+                 tokenizer = pickle.load(f)
+             sentences = tokenizer.tokenize(processed_text)
+         except Exception as e:
+             logger.error(f"NLTK tokenization failed: {e}", exc_info=True)
+             sentences = sent_tokenize(processed_text)
+
+     # Find participants
+     people = set()
+     if doc:
+         for ent in doc.ents:
+             if ent.label_ == "PERSON":
+                 person = ent.text.strip()
+                 if len(person) > 2:
+                     people.add(person)
+
+     if people:
+         categories["participants"] = list(people)
+
+     try:
+         # chunk the text because of the BART model's input-length limit
+         logger.info(processed_text[:100])
+         processed_text = chunk_text(processed_text)
+         parts_summaries = []
+
+         for chunk in processed_text:
+             result = summarizer_pipeline(chunk, max_length=150, min_length=30, do_sample=False)
+             if result and isinstance(result, list) and len(result) > 0:
+                 part_summary = result[0].get('summary_text', '')
+                 if part_summary:
+                     parts_summaries.append(part_summary)
+
+         overall_summary = " ".join(parts_summaries)
+         overall_summary = summarizer_pipeline(overall_summary, max_length=150, min_length=30, do_sample=False)[0]['summary_text']
+         categories["overall_summary"] = [overall_summary]
+
+     except Exception as e:
+         logger.error(f"Summarization failed: {e}", exc_info=True)
+         categories["overall_summary"] = ["Failed to generate overall summary."]
+
+     # Process each sentence
+     for i, sentence in enumerate(sentences):
+         sentence = sentence.strip()
+         if not sentence:
+             continue
+
+         # Check for action items
+         if (re.search(r'(need to|will|shall|must|should|have to|assigned to|responsible for|task|action item|to-do|follow up|take care of)',
+                       sentence, re.IGNORECASE) and
+                 re.search(r'(we|you|I|they|he|she|team|group|department)', sentence, re.IGNORECASE)):
+             categories["action_items"].append(sentence)
+             continue
+
+         # Check for decisions
+         if re.search(r'(decided|agreed|conclusion|resolved|approved|rejected|consensus|finalized|confirmed|determined)',
+                      sentence, re.IGNORECASE):
+             categories["decisions"].append(sentence)
+             continue
+
+         # Check for deadlines/timing with stronger patterns
+         if re.search(r'(by|due|deadline|schedule|date|tomorrow|next week|month|calendar|remind|upcoming|on|at|until)',
+                      sentence, re.IGNORECASE) and re.search(r'(time|day|week|month|year|hour|minute)', sentence, re.IGNORECASE):
+             categories["deadlines"].append(sentence)
+             continue
+
+         # Check for questions/issues
+         if (re.search(r'(\?|issue|problem|concern|question|clarif|wonder|how|what|when|where|why|who)',
+                       sentence, re.IGNORECASE) and
+                 not re.search(r'(answer|answered|resolved|solved)', sentence, re.IGNORECASE)):
+             categories["questions"].append(sentence)
+             continue
+
+         # Check for intro statements
+         if i < len(sentences) // 10:  # First 10% of sentences
+             if re.search(r'(welcome|begin|start|agenda|today|discuss|meeting|introduce|opening|good morning|hello|topic)',
+                          sentence, re.IGNORECASE):
+                 categories["intro"].append(sentence)
+                 continue
+
+         # Check for conclusion statements
+         if i > len(sentences) * 9 // 10:  # Last 10% of sentences
+             if re.search(r'(conclude|end|wrap|summary|thank|next meeting|follow up|adjourn|goodbye|bye|closing)',
+                          sentence, re.IGNORECASE):
+                 categories["conclusion"].append(sentence)
+                 continue
+
+         # Everything else is considered a topic if it has substance
+         if len(sentence.split()) > 3:  # Avoid very short sentences
+             categories["topics"].append(sentence)
+
+     # Process categories to avoid repetition and consolidate related points
+     for category in categories:
+         if category in ["topics", "action_items", "decisions", "questions", "deadlines"]:
+             categories[category] = consolidate_similar_items(categories[category])
+
+     # Limit the number of topics to avoid overwhelming the reader
+     if len(categories["topics"]) > 10:
+         # If we have a summarizer, try to generate a summary of the topics
+         try:
+             topics_text = " ".join(categories["topics"])
+             topics_summary = summarizer_pipeline(topics_text, max_length=200, min_length=50, do_sample=False)[0]['summary_text']
+             categories["topics"] = sent_tokenize(topics_summary)
+         except Exception as e:
+             logger.error(f"Topics summarization failed: {e}", exc_info=True)
+             # Otherwise just take the first few and last few topics
+             categories["topics"] = categories["topics"][:5] + categories["topics"][-5:]
+
+     # Add emojis to the formatted output
+     formatted_summary = []
+
+     # Format the meeting title if available
+     if categories.get("meeting_title"):
+         formatted_summary.append(f"📝 **Meeting Title:** {categories['meeting_title'][0]}")
+         formatted_summary.append("")
+
+     # Add the overall summary
+     if categories.get("overall_summary"):
+         formatted_summary.append("📋 **Executive Summary:**")
+         formatted_summary.append(categories["overall_summary"][0])
+         formatted_summary.append("")
+
+     # Format participants
+     if categories["participants"]:
+         formatted_summary.append("👥 **Participants:**")
+         formatted_summary.append(", ".join(categories["participants"]))
+         formatted_summary.append("")
+
+     # Format the intro
+     if categories["intro"]:
+         formatted_summary.append("🎯 **Meeting Introduction:**")
+         formatted_summary.append(" ".join(categories["intro"]))
+         formatted_summary.append("")
+
+     # Format the main topics
+     if categories["topics"]:
+         formatted_summary.append("💡 **Key Topics:**")
+         for i, topic in enumerate(categories["topics"], 1):
+             formatted_summary.append(f"{i}. {topic}")
+         formatted_summary.append("")
+
+     # Format decisions
+     if categories["decisions"]:
+         formatted_summary.append("✅ **Decisions Made:**")
+         for decision in categories["decisions"]:
+             formatted_summary.append(f"• {decision}")
+         formatted_summary.append("")
+
+     # Format action items
+     if categories["action_items"]:
+         formatted_summary.append("📋 **Action Items:**")
+         for item in categories["action_items"]:
+             formatted_summary.append(f"• {item}")
+         formatted_summary.append("")
+
+     # Format questions
+     if categories["questions"]:
+         formatted_summary.append("❓ **Questions & Concerns:**")
+         for question in categories["questions"]:
+             formatted_summary.append(f"• {question}")
+         formatted_summary.append("")
+
+     # Format deadlines
+     if categories["deadlines"]:
+         formatted_summary.append("⏰ **Deadlines & Timing:**")
+         for deadline in categories["deadlines"]:
+             formatted_summary.append(f"• {deadline}")
+         formatted_summary.append("")
+
+     # Format the conclusion
+     if categories["conclusion"]:
+         formatted_summary.append("🏁 **Conclusion:**")
+         formatted_summary.append(" ".join(categories["conclusion"]))
+
+     return "\n".join(formatted_summary)
+
+
+ def create_enhanced_summary_prompt(transcript: str, language_name: str) -> str:
+     """
+     Creates a single, dynamic and insistent prompt that instructs the AI
+     to output its findings in the specified language.
+     """
+     return f"""
+     You are an expert AI assistant. Your task is to analyze the following meeting transcript and extract key information into a structured JSON object.
+
+     **Primary Goal:** Analyze the provided transcript and generate a structured summary.
+
+     **CRITICAL LANGUAGE INSTRUCTION:** All text in your final JSON response must be written in the following language: **{language_name}**. There are no exceptions.
+
+     **ANALYSIS INSTRUCTIONS:**
+     1. Read the entire transcript to understand its context.
+     2. Identify a concise title for the meeting.
+     3. Identify all participants mentioned.
+     4. Write a brief paragraph summarizing the core themes and outcomes.
+     5. List all clear and agreed-upon decisions.
+     6. Extract all clear action items, identifying the task, who it was assigned to, the due date if mentioned, and the context.
+
+     **OUTPUT INSTRUCTIONS:**
+     - Respond ONLY with a valid JSON object.
+     - The JSON must use these exact keys: "meeting_title", "participants", "meeting_summary", "decisions_made", "action_items".
+     - **Language Check:** Before you finalize your response, verify that every single string value within the JSON is written in **{language_name}**.
+
+     **TRANSCRIPT TO ANALYZE:**
+     \"\"\"
+     {transcript}
+     \"\"\"
+     """
+
+
+ @lru_cache(maxsize=None)
+ def get_language_name(language_code: str) -> str:
+     """Converts a two-letter language code (e.g., 'es') to its full name (e.g., 'Spanish')."""
+     try:
+         lang = pycountry.languages.get(alpha_2=language_code)
+         return lang.name if lang else language_code
+     except Exception:
+         return language_code
+
+
+ def format_summary_to_markdown(summary_json: dict) -> str:
+     """Converts the structured JSON summary into a formatted Markdown string."""
+
+     summary_data = {k.lower().replace(" ", "_"): v for k, v in summary_json.items()}
+
+     if not summary_data.get("meeting_summary") and not summary_data.get("decisions_made") and not summary_data.get("action_items"):
+         return "The provided transcript was too short or lacked sufficient content to generate a detailed summary."
+
+     markdown_parts = []
+
+     if title := summary_data.get("meeting_title"):
+         markdown_parts.append(f"### {title}\n")
+
+     if summary := summary_data.get("meeting_summary"):
+         markdown_parts.append("📝 **Meeting Summary:**")
+         markdown_parts.append(summary)
+         markdown_parts.append("")
+
+     if decisions := summary_data.get("decisions_made"):
+         markdown_parts.append("📌 **Decisions Made:**")
+         for decision in decisions:
+             markdown_parts.append(f"- {decision}")
+         markdown_parts.append("")
+
+     if action_items := summary_data.get("action_items"):
+         markdown_parts.append("✅ **Action Items:**")
+         for item in action_items:
+             task = item.get('task', item.get('Task', 'N/A'))
+             assigned_to = item.get('assigned_to', item.get('Assigned To', 'Not specified'))
+             due_date = item.get('due_date', item.get('Due Date', 'Not specified'))
+             context = item.get('context', item.get('Context', ''))
+
+             markdown_parts.append(f"- **Task**: {task}")
+             markdown_parts.append(f"  - **Assigned To**: {assigned_to}")
+             markdown_parts.append(f"  - **Due Date**: {due_date}")
+             if context:
+                 markdown_parts.append(f"  - **Context**: {context}")
+             markdown_parts.append("")
+
+     return "\n".join(markdown_parts)
 
utils.py CHANGED
@@ -0,0 +1,96 @@
+ import re
+ import subprocess
+ import tempfile
+ import os
+
+
+ def clean_transcript(text):
+     """Clean the transcript by removing filler words and collapsing whitespace."""
+
+     filler_words = [
+         r'\bum\b', r'\buh\b', r'\blike\b', r'\byou know\b', r'\bkind of\b',
+         r'\bsort of\b', r'\bI mean\b', r'\bbasically\b', r'\bactually\b',
+         r'\bso\b', r'\banyway\b', r'\bjust\b'
+     ]
+
+     for word in filler_words:
+         text = re.sub(word, "", text, flags=re.IGNORECASE)
+
+     text = re.sub(r'\s+', ' ', text)
+
+     return text.strip()
+
+
+ def consolidate_similar_items(items):
+     """Consolidate similar items to reduce repetition."""
+     if not items or len(items) <= 1:
+         return items
+
+     # Simple similarity measure based on word overlap
+     result = [items[0]]
+     for item in items[1:]:
+         # Convert to sets of words for comparison
+         item_words = set(item.lower().split())
+
+         # Check if this item is too similar to any existing item
+         too_similar = False
+         for existing_item in result:
+             existing_words = set(existing_item.lower().split())
+             # Calculate Jaccard similarity
+             intersection = len(item_words.intersection(existing_words))
+             union = len(item_words.union(existing_words))
+             if union > 0 and intersection / union > 0.6:  # 60% similarity threshold
+                 too_similar = True
+                 break
+
+         if not too_similar:
+             result.append(item)
+
+     return result
+
+
+ def chunk_text(text, max_tokens=800):
+     words = text.split()
+     chunks = []
+     if len(words) > max_tokens:
+         for i in range(0, len(words), max_tokens):
+             chunk = " ".join(words[i:i + max_tokens])
+             chunks.append(chunk)
+         return chunks
+     else:
+         return [text]
+
+
+ def webm_to_wav(webm_bytes: bytes) -> str:
+     """
+     Converts webm audio bytes to a wav file and returns the path to the wav file.
+     """
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".webm") as webm_file:
+         webm_path = webm_file.name
+         webm_file.write(webm_bytes)
+
+     wav_path = webm_path.replace(".webm", ".wav")
+
+     try:
+         subprocess.run([
+             "ffmpeg", "-y", "-i", webm_path, "-ar", "16000", "-ac", "1", wav_path
+         ], check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
+     finally:
+         os.remove(webm_path)
+
+     return wav_path
+
+
+ def preprocess_transcript(transcript: str) -> str:
+     """
+     Cleans and normalizes the transcript.
+     - Removes extra whitespace.
+     - Can be expanded to handle speaker diarization, e.g., "Speaker A:" -> "Alice:"
+     """
+     # Simple cleaning
+     text = re.sub(r'\s+', ' ', transcript).strip()
+
+     # Advanced: a future step could be to normalize speaker names,
+     # e.g., mapping "Bob's update" to "Bob:"
+
+     return text
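
And a small demonstration of the chunking helper, which keeps each piece under BART's input window; the word counts below follow directly from the 800-word default. Note that webm_to_wav additionally requires an ffmpeg binary on PATH, which requirements.txt cannot declare.

    from utils import chunk_text

    transcript = " ".join(["word"] * 2000)  # synthetic 2000-word transcript
    chunks = chunk_text(transcript, max_tokens=800)
    print([len(c.split()) for c in chunks])  # [800, 800, 400]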