Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

chenguittiMaroua committed on Apr 9

Commit

2001581

verified ·

1 Parent(s): b649976

Update main.py

Browse files

Files changed (1) hide show

main.py +132 -177

main.py CHANGED Viewed

@@ -1,32 +1,11 @@
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
-from fastapi.staticfiles import StaticFiles
-from transformers import pipeline, AutoTokenizer, AutoModelForQuestionAnswering
-from sentence_transformers import SentenceTransformer, util
-from typing import Optional
-import io
-import fitz  # PyMuPDF
-from PIL import Image
-import pandas as pd
-import uvicorn
-from functools import lru_cache
-from docx import Document
-from pptx import Presentation
-import pytesseract
-import torch
-from typing import Dict
-from transformers import Pipeline
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException, status
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
 from fastapi.encoders import jsonable_encoder
 from transformers import pipeline, Pipeline
-from typing import Dict, Optional, Tuple
 from pydantic import BaseModel, constr, validator
 import io
 import fitz  # PyMuPDF
@@ -41,74 +20,13 @@ import os
 from datetime import datetime
 from pathlib import Path
 import re
-from fastapi.responses import HTMLResponse
-from fastapi.templating import Jinja2Templates
-from fastapi import Request
-from pathlib import Path
-import os
-print(os.getcwd())  # This prints the current working directory
-# Initialize FastAPI app
-app = FastAPI()
-print(os.getcwd())
-templates = Jinja2Templates(directory=str(Path(__file__).parent / "templates"))
-# Configure CORS
-app.add_middleware(
-    CORSMiddleware,
-    allow_origins=[
-        "https://*.hf.space",
-        "http://localhost",
-        "http://localhost:8000"
-    ],
-    allow_credentials=True,
-    allow_methods=["*"],
-    allow_headers=["*"],
-)
-# Serve static files (frontend)
-app.mount("/static", StaticFiles(directory="static"), name="static")
-# Model loading with caching
-@lru_cache()
-def get_summarizer():
-    return pipeline("summarization", model="facebook/bart-large-cnn")
-@lru_cache()
-def get_image_captioning():
-    return pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
-@lru_cache()
-def get_translator():
-    return pipeline("translation", model="facebook/nllb-200-distilled-600M")
-@lru_cache()
-def get_qa_model():
-    return pipeline("question-answering", model="deepset/roberta-base-squad2")
-#########################################################
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 app = FastAPI(
     title="AI Document Analysis API",
     description="Advanced document processing with multilingual support",
@@ -117,7 +35,7 @@ app = FastAPI(
     redoc_url="/redoc"
 )
-# CORS Configuration
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -126,6 +44,12 @@ app.add_middleware(
     allow_headers=["*"],
 )
 # Constants
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
 MAX_TEXT_LENGTH = 2000
@@ -195,7 +119,7 @@ class ErrorResponse(BaseModel):
 # Exception Handler
 @app.exception_handler(HTTPException)
-async def http_exception_handler(request, exc):
     error_response = ErrorResponse(
         error=exc.detail,
         status_code=exc.status_code,
@@ -282,18 +206,67 @@ def preprocess_text(text: str) -> str:
     text = re.sub(r'\s+', ' ', text).strip()
     return text[:MAX_TEXT_LENGTH] if len(text) > MAX_TEXT_LENGTH else text
 @app.post("/qa")
 async def question_answering(
     file: UploadFile = File(...),
     question: str = Form(...),
     language: str = Form(DEFAULT_LANGUAGE)
-) -> JSONResponse:
     try:
-        # Validation et extraction du texte
         file_ext, content = await validate_and_read_file(file)
         text = preprocess_text(extract_text(content, file_ext))
-        # Détection spéciale pour les questions sur le thème
         theme_keywords = {
             "fr": ["thème", "sujet principal", "quoi le sujet"],
             "en": ["theme", "main topic", "what is about"]
@@ -305,15 +278,17 @@ async def question_answering(
         )
         if is_theme_question:
-            # Utilisation d'un prompt spécialisé pour l'analyse thématique
             theme_prompt = (
-                "Extrayez le thème principal de ce texte en 1-2 phrases. "
-                "Répondez comme si vous expliquiez à un novice. "
-                "Texte : {text}"
             )
-            # Utilisation d'un LLM plus puissant pour l'analyse thématique
-            generator = get_model("moussaKam/barthez-orangesum-abstract", "text-generation")
             response = generator(
                 theme_prompt.format(text=text[:2000]),
                 max_length=200,
@@ -321,43 +296,43 @@ async def question_answering(
                 do_sample=False
             )
-            # Nettoyage de la réponse
             theme = response[0]["generated_text"].split(":")[-1].strip()
-            theme = re.sub(r"^(Le|La)\s+", "", theme)  # Retire les articles en début de phrase
-            return JSONResponse({
                 "question": question,
-                "answer": f"Le document traite principalement de : {theme}",
-                "confidence": 0.95,  # Haut confiance car méthode spécialisée
                 "language": language,
                 "processing_method": "theme_analysis",
                 "success": True
-            })
-        # ... reste du code pour les questions normales ...
-        # ... reste du code pour les questions normales ...
         # Standard QA processing
-        result = qa_model(question=request.question, context=clean_text)
-        if result["score"] < 0.1:  # Low confidence threshold
-            return JSONResponse({
-                "question": request.question,
                 "answer": "No clear answer found in the document" if language == "en" else "Aucune réponse claire trouvée dans le document",
                 "confidence": result["score"],
                 "language": language,
                 "warning": "low_confidence",
                 "success": True
-            })
-        return JSONResponse({
-            "question": request.question,
             "answer": result["answer"],
             "confidence": result["score"],
             "language": language,
             "success": True
-        })
     except HTTPException:
         raise
@@ -369,66 +344,34 @@ async def question_answering(
             details={"error": str(e)}
         )
-    ########################################################
-@app.get("/", response_class=HTMLResponse)
-def home ():
-    with open("static/indexAI.html","r") as file :
-        return file.read()
-# API Endpoints
-@app.get("/health")
-async def health_check():
-    return {"status": "healthy"}
-@app.post("/summarize")
-async def summarize_document(file: UploadFile = File(...)):
-    try:
-        content = await file.read()
-        file_ext = file.filename.split(".")[-1].lower()
-        text = ""
-        if file_ext == "docx":
-            doc = Document(io.BytesIO(content))
-            text = " ".join([p.text for p in doc.paragraphs if p.text.strip()])
-        elif file_ext in ["xls", "xlsx"]:
-            df = pd.read_excel(io.BytesIO(content))
-            text = " ".join(df.iloc[:, 0].dropna().astype(str).tolist())
-        elif file_ext == "pptx":
-            ppt = Presentation(io.BytesIO(content))
-            text = " ".join([shape.text for slide in ppt.slides for shape in slide.shapes if hasattr(shape, "text")])
-        elif file_ext == "pdf":
-            pdf = fitz.open(stream=content, filetype="pdf")
-            text = " ".join([page.get_text("text") for page in pdf])
-        elif file_ext in ["jpg", "jpeg", "png"]:
-            image = Image.open(io.BytesIO(content))
-            text = get_image_captioning()(image)[0]['generated_text']
-        else:
-            raise HTTPException(400, "Unsupported file format")
-        if not text.strip():
-            raise HTTPException(400, "No extractable text found")
-        summarizer = get_summarizer()
-        chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
-        summary = " ".join([
-            summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
-            for chunk in chunks
-        ])
-        return {"summary": summary}
-    except Exception as e:
-        raise HTTPException(500, f"Error processing document: {str(e)}")
-#################################################################
-###############################################
 @app.post("/api/caption")
 async def caption_image(file: UploadFile = File(...)):
     try:
-        image = Image.open(io.BytesIO(await file.read())).convert("RGB")
-        caption = get_image_captioning()(image)[0]['generated_text']
-        return {"caption": caption}
     except Exception as e:
-        raise HTTPException(500, f"Error processing image: {str(e)}")
 @app.post("/translate")
 async def translate_text(
@@ -437,10 +380,22 @@ async def translate_text(
     src_lang: str = Form("eng_Latn")
 ):
     try:
-        translated = get_translator()(text, src_lang=src_lang, tgt_lang=target_lang)
-        return {"translated_text": translated[0]["translation_text"]}
     except Exception as e:
-        raise HTTPException(500, f"Error translating text: {str(e)}")
 # Run the application
 if __name__ == "__main__":

+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, status, Request
 from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse, HTMLResponse
 from fastapi.encoders import jsonable_encoder
+from fastapi.staticfiles import StaticFiles
+from fastapi.templating import Jinja2Templates
 from transformers import pipeline, Pipeline
+from typing import Dict, Optional, Tuple, List
 from pydantic import BaseModel, constr, validator
 import io
 import fitz  # PyMuPDF
 from datetime import datetime
 from pathlib import Path
 import re
+import torch
 # Configure logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
+# Initialize FastAPI app
 app = FastAPI(
     title="AI Document Analysis API",
     description="Advanced document processing with multilingual support",
     redoc_url="/redoc"
 )
+# Configure CORS
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Set up templates
+templates = Jinja2Templates(directory=str(Path(__file__).parent / "templates"))
+# Serve static files
+app.mount("/static", StaticFiles(directory="static"), name="static")
 # Constants
 MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
 MAX_TEXT_LENGTH = 2000
 # Exception Handler
 @app.exception_handler(HTTPException)
+async def http_exception_handler(request: Request, exc: HTTPException):
     error_response = ErrorResponse(
         error=exc.detail,
         status_code=exc.status_code,
     text = re.sub(r'\s+', ' ', text).strip()
     return text[:MAX_TEXT_LENGTH] if len(text) > MAX_TEXT_LENGTH else text
+def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
+    """Cut ``text`` into consecutive slices of at most ``chunk_size`` characters.
+
+    The last slice may be shorter than ``chunk_size``; an empty ``text``
+    produces an empty list.
+    """
+    pieces: List[str] = []
+    for start in range(0, len(text), chunk_size):
+        pieces.append(text[start:start + chunk_size])
+    return pieces
+# API Endpoints
+@app.get("/", response_class=HTMLResponse)
+async def home(request: Request):
+    """Render the ``index.html`` template as the landing page."""
+    # The incoming request is handed to the template under the "request" key.
+    context = {"request": request}
+    return templates.TemplateResponse("index.html", context)
+@app.get("/health")
+async def health_check():
+    """Report a fixed "healthy" status together with the current time.
+
+    The timestamp comes from ``datetime.now()`` (no timezone attached) and is
+    rendered with ``isoformat()``.
+    """
+    now_iso = datetime.now().isoformat()
+    return {"status": "healthy", "timestamp": now_iso}
+@app.post("/summarize")
+async def summarize_document(file: UploadFile = File(...)):
+    """Summarize an uploaded document chunk by chunk.
+
+    The upload is read through ``validate_and_read_file``; its text is
+    extracted with ``extract_text``, normalised with ``preprocess_text`` and
+    split with ``chunk_text``. Each chunk is passed to the summarization
+    pipeline and the partial summaries are joined with single spaces.
+
+    Returns:
+        A dict with ``success``, ``summary``, ``language`` (always ``"en"``)
+        and ``processed_chunks`` (number of chunks summarized).
+
+    Raises:
+        HTTPException: 400 when no text could be extracted, 500 on any
+            unexpected failure. HTTPExceptions raised inside the ``try``
+            body are re-raised unchanged.
+    """
+    try:
+        file_ext, content = await validate_and_read_file(file)
+        text = preprocess_text(extract_text(content, file_ext))
+        if not text.strip():
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="No extractable text found in document"
+            )
+        # Fall back to the BART CNN checkpoint when the mapping has no entry.
+        model_name = MODEL_MAPPING.get("en", {}).get("summarization", "facebook/bart-large-cnn")
+        summarizer = get_model(model_name, "summarization")
+        chunks = chunk_text(text)
+        summaries = [
+            summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
+            for chunk in chunks
+        ]
+        return {
+            "success": True,
+            "summary": " ".join(summaries),
+            "language": "en",
+            "processed_chunks": len(chunks)
+        }
+    except HTTPException:
+        # Deliberate HTTP errors (e.g. the 400 above) must not become 500s.
+        raise
+    except Exception as e:
+        # The underlying error and its traceback go to the log only.
+        logger.exception("Summarization failed: %s", e)
+        # HTTPException accepts only status_code, detail and headers; the
+        # former ``details=`` keyword raised TypeError inside this handler,
+        # so the intended 500 response was never produced.
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Document summarization failed"
+        ) from e
 @app.post("/qa")
 async def question_answering(
     file: UploadFile = File(...),
     question: str = Form(...),
     language: str = Form(DEFAULT_LANGUAGE)
+):
     try:
         file_ext, content = await validate_and_read_file(file)
         text = preprocess_text(extract_text(content, file_ext))
+        # Theme detection
         theme_keywords = {
             "fr": ["thème", "sujet principal", "quoi le sujet"],
             "en": ["theme", "main topic", "what is about"]
         )
         if is_theme_question:
+            model_name = MODEL_MAPPING.get(language, {}).get("summarization")
+            if not model_name:
+                model_name = MODEL_MAPPING["default"].get("summarization")
+            generator = get_model(model_name, "text-generation")
             theme_prompt = (
+                "Extract the main theme of this text in 1-2 sentences. "
+                "Respond as if explaining to a beginner. "
+                "Text: {text}"
             )
             response = generator(
                 theme_prompt.format(text=text[:2000]),
                 max_length=200,
                 do_sample=False
             )
             theme = response[0]["generated_text"].split(":")[-1].strip()
+            theme = re.sub(r"^(Le|La)\s+", "", theme)
+            return {
                 "question": question,
+                "answer": f"The document mainly discusses: {theme}",
+                "confidence": 0.95,
                 "language": language,
                 "processing_method": "theme_analysis",
                 "success": True
+            }
         # Standard QA processing
+        model_name = MODEL_MAPPING.get(language, {}).get("qa")
+        if not model_name:
+            model_name = MODEL_MAPPING["default"].get("qa")
+        qa_model = get_model(model_name, "question-answering")
+        result = qa_model(question=question, context=text)
+        if result["score"] < 0.1:
+            return {
+                "question": question,
                 "answer": "No clear answer found in the document" if language == "en" else "Aucune réponse claire trouvée dans le document",
                 "confidence": result["score"],
                 "language": language,
                 "warning": "low_confidence",
                 "success": True
+            }
+        return {
+            "question": question,
             "answer": result["answer"],
             "confidence": result["score"],
             "language": language,
             "success": True
+        }
     except HTTPException:
         raise
             details={"error": str(e)}
         )
 @app.post("/api/caption")
 async def caption_image(file: UploadFile = File(...)):
+    """Generate a caption for an uploaded JPG/JPEG/PNG image.
+
+    The upload is read through ``validate_and_read_file``, decoded with PIL,
+    converted to RGB and passed to the image-to-text pipeline configured
+    under ``MODEL_MAPPING["default"]["image_captioning"]``.
+
+    Returns:
+        A dict with ``success``, ``caption`` and ``file_type``.
+
+    Raises:
+        HTTPException: 400 when the extension is not jpg/jpeg/png, 500 on
+            any unexpected failure. HTTPExceptions raised inside the ``try``
+            body are re-raised unchanged.
+    """
     try:
+        file_ext, content = await validate_and_read_file(file)
+        if file_ext not in {"jpg", "jpeg", "png"}:
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Only image files are supported for captioning"
+            )
+        image = Image.open(io.BytesIO(content)).convert("RGB")
+        captioner = get_model(MODEL_MAPPING["default"]["image_captioning"], "image-to-text")
+        caption = captioner(image)[0]['generated_text']
+        return {
+            "success": True,
+            "caption": caption,
+            "file_type": file_ext
+        }
+    except HTTPException:
+        # Deliberate HTTP errors (e.g. the 400 above) must not become 500s.
+        raise
     except Exception as e:
+        # The underlying error and its traceback go to the log only.
+        logger.exception("Image captioning failed: %s", e)
+        # HTTPException accepts only status_code, detail and headers; the
+        # former ``details=`` keyword raised TypeError inside this handler,
+        # so the intended 500 response was never produced.
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Image captioning failed"
+        ) from e
 @app.post("/translate")
 async def translate_text(
     src_lang: str = Form("eng_Latn")
 ):
     try:
+        translator = get_model(MODEL_MAPPING["default"]["multilingual_translation"], "translation")
+        translated = translator(text, src_lang=src_lang, tgt_lang=target_lang)
+        return {
+            "success": True,
+            "translated_text": translated[0]["translation_text"],
+            "source_language": src_lang,
+            "target_language": target_lang
+        }
     except Exception as e:
+        logger.error(f"Translation failed: {str(e)}")
+        raise HTTPException(
+            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
+            detail="Text translation failed",
+            details={"error": str(e)}
+        )
 # Run the application
 if __name__ == "__main__":