chenguittiMaroua committed on
Commit d728ee4 · verified · 1 Parent(s): 2001581

Update main.py

Files changed (1)
  1. main.py +78 -300
main.py CHANGED
@@ -1,12 +1,8 @@
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException, status, Request
  from fastapi.middleware.cors import CORSMiddleware
- from fastapi.responses import JSONResponse, HTMLResponse
- from fastapi.encoders import jsonable_encoder
- from fastapi.staticfiles import StaticFiles
- from fastapi.templating import Jinja2Templates
- from transformers import pipeline, Pipeline
- from typing import Dict, Optional, Tuple, List
- from pydantic import BaseModel, constr, validator
  import io
  import fitz # PyMuPDF
  from PIL import Image
@@ -16,161 +12,66 @@ from docx import Document
  from pptx import Presentation
  import pytesseract
  import logging
- import os
- from datetime import datetime
- from pathlib import Path
  import re
- import torch

  # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- # Initialize FastAPI app
- app = FastAPI(
-     title="AI Document Analysis API",
-     description="Advanced document processing with multilingual support",
-     version="2.0.0",
-     docs_url="/docs",
-     redoc_url="/redoc"
- )

- # Configure CORS
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
-     allow_credentials=True,
      allow_methods=["*"],
      allow_headers=["*"],
  )

- # Set up templates
- templates = Jinja2Templates(directory=str(Path(__file__).parent / "templates"))
-
- # Serve static files
- app.mount("/static", StaticFiles(directory="static"), name="static")
-
  # Constants
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
- MAX_TEXT_LENGTH = 2000
- MAX_QUESTION_LENGTH = 500
- MIN_QUESTION_LENGTH = 3
- SUPPORTED_LANGUAGES = {"fr", "en", "es", "de"}
- DEFAULT_LANGUAGE = "fr"
-
  SUPPORTED_FILE_TYPES = {
-     "docx": "Word Document",
-     "xlsx": "Excel Spreadsheet",
-     "pptx": "PowerPoint Presentation",
-     "pdf": "PDF Document",
-     "jpg": "JPEG Image",
-     "jpeg": "JPEG Image",
-     "png": "PNG Image"
- }
-
- MODEL_MAPPING = {
-     "fr": {
-         "qa": "illuin/camembert-base-fquad",
-         "summarization": "moussaKam/barthez-orangesum-abstract",
-         "translation": "Helsinki-NLP/opus-mt-fr-en"
-     },
-     "en": {
-         "qa": "deepset/roberta-base-squad2",
-         "summarization": "facebook/bart-large-cnn",
-         "translation": "Helsinki-NLP/opus-mt-en-fr"
-     },
-     "default": {
-         "image_captioning": "Salesforce/blip-image-captioning-large",
-         "multilingual_translation": "facebook/nllb-200-distilled-600M"
-     }
  }

- # Models cache
- models_cache: Dict[str, Pipeline] = {}
-
- # Pydantic Models
- class TranslationRequest(BaseModel):
-     text: constr(min_length=1, max_length=5000)
-     target_lang: constr(min_length=2, max_length=5)
-     src_lang: Optional[constr(min_length=2, max_length=5)] = None
-
-     @validator('target_lang', 'src_lang')
-     def validate_language_code(cls, v):
-         if v and len(v) not in {2, 5}:
-             raise ValueError("Language code must be 2 or 5 characters")
-         return v
-
- class QARequest(BaseModel):
-     question: constr(min_length=MIN_QUESTION_LENGTH, max_length=MAX_QUESTION_LENGTH)
-     language: constr(min_length=2, max_length=2) = DEFAULT_LANGUAGE
-
-     @validator('language')
-     def validate_language(cls, v):
-         if v.lower() not in SUPPORTED_LANGUAGES:
-             raise ValueError(f"Unsupported language. Supported: {SUPPORTED_LANGUAGES}")
-         return v.lower()
-
- class ErrorResponse(BaseModel):
-     error: str
-     success: bool = False
-     status_code: int
-     timestamp: str
-     details: Optional[dict] = None
-
- # Exception Handler
- @app.exception_handler(HTTPException)
- async def http_exception_handler(request: Request, exc: HTTPException):
-     error_response = ErrorResponse(
-         error=exc.detail,
-         status_code=exc.status_code,
-         timestamp=datetime.now().isoformat(),
-         details=getattr(exc, 'details', None)
-     )
-     return JSONResponse(
-         status_code=exc.status_code,
-         content=jsonable_encoder(error_response)
-     )
-
- # Helper Functions
- def get_model(model_name: str, task: str) -> Pipeline:
-     """Get or load a Hugging Face model with caching."""
-     cache_key = f"{model_name}_{task}"
-     if cache_key not in models_cache:
-         try:
-             logger.info(f"Loading model: {model_name} for task: {task}")
-             models_cache[cache_key] = pipeline(task, model=model_name)
-         except Exception as e:
-             logger.error(f"Model loading failed: {str(e)}")
-             raise HTTPException(
-                 status_code=status.HTTP_503_SERVICE_UNAVAILABLE,
-                 detail="Model service unavailable",
-                 details={"model": model_name, "error": str(e)}
-             )
-     return models_cache[cache_key]
-
- async def validate_and_read_file(file: UploadFile) -> Tuple[str, bytes]:
-     """Validate and read uploaded file."""
-     # Check file extension
-     file_ext = Path(file.filename).suffix[1:].lower()
      if file_ext not in SUPPORTED_FILE_TYPES:
-         raise HTTPException(
-             status_code=status.HTTP_400_BAD_REQUEST,
-             detail=f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES.values())}"
-         )
-
-     # Read and check file size
      content = await file.read()
      if len(content) > MAX_FILE_SIZE:
-         raise HTTPException(
-             status_code=status.HTTP_413_REQUEST_ENTITY_TOO_LARGE,
-             detail=f"File exceeds maximum size of {MAX_FILE_SIZE//1024//1024}MB"
-         )
-
-     await file.seek(0)
      return file_ext, content

  def extract_text(content: bytes, file_ext: str) -> str:
-     """Extract text from various file formats."""
      try:
          if file_ext == "docx":
              doc = Document(io.BytesIO(content))
@@ -187,7 +88,12 @@ def extract_text(content: bytes, file_ext: str) -> str:

          elif file_ext == "pdf":
              pdf = fitz.open(stream=content, filetype="pdf")
-             return " ".join(page.get_text("text") for page in pdf)

          elif file_ext in {"jpg", "jpeg", "png"}:
              image = Image.open(io.BytesIO(content))
@@ -195,209 +101,81 @@ def extract_text(content: bytes, file_ext: str) -> str:

      except Exception as e:
          logger.error(f"Text extraction failed: {str(e)}")
-         raise HTTPException(
-             status_code=status.HTTP_422_UNPROCESSABLE_ENTITY,
-             detail="Failed to extract text from file",
-             details={"error": str(e), "file_type": file_ext}
-         )
-
- def preprocess_text(text: str) -> str:
-     """Clean and normalize extracted text."""
-     text = re.sub(r'\s+', ' ', text).strip()
-     return text[:MAX_TEXT_LENGTH] if len(text) > MAX_TEXT_LENGTH else text
-
- def chunk_text(text: str, chunk_size: int = 1000) -> List[str]:
-     """Split text into chunks for processing."""
-     return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]
-
- # API Endpoints
- @app.get("/", response_class=HTMLResponse)
- async def home(request: Request):
-     return templates.TemplateResponse("index.html", {"request": request})
-
- @app.get("/health")
- async def health_check():
-     return {"status": "healthy", "timestamp": datetime.now().isoformat()}

  @app.post("/summarize")
  async def summarize_document(file: UploadFile = File(...)):
      try:
-         file_ext, content = await validate_and_read_file(file)
-         text = preprocess_text(extract_text(content, file_ext))

          if not text.strip():
-             raise HTTPException(
-                 status_code=status.HTTP_400_BAD_REQUEST,
-                 detail="No extractable text found in document"
-             )
-
-         model_name = MODEL_MAPPING.get("en", {}).get("summarization", "facebook/bart-large-cnn")
-         summarizer = get_model(model_name, "summarization")

-         chunks = chunk_text(text)
          summaries = []
          for chunk in chunks:
              summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
              summaries.append(summary)

-         return {
-             "success": True,
-             "summary": " ".join(summaries),
-             "language": "en",
-             "processed_chunks": len(chunks)
-         }
      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"Summarization failed: {str(e)}")
-         raise HTTPException(
-             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-             detail="Document summarization failed",
-             details={"error": str(e)}
-         )

  @app.post("/qa")
  async def question_answering(
      file: UploadFile = File(...),
      question: str = Form(...),
-     language: str = Form(DEFAULT_LANGUAGE)
  ):
      try:
-         file_ext, content = await validate_and_read_file(file)
-         text = preprocess_text(extract_text(content, file_ext))

-         # Theme detection
-         theme_keywords = {
-             "fr": ["thème", "sujet principal", "quoi le sujet"],
-             "en": ["theme", "main topic", "what is about"]
-         }

-         is_theme_question = any(
-             kw in question.lower()
-             for kw in theme_keywords.get(language, theme_keywords["en"])
-         )

-         if is_theme_question:
-             model_name = MODEL_MAPPING.get(language, {}).get("summarization")
-             if not model_name:
-                 model_name = MODEL_MAPPING["default"].get("summarization")
-
-             generator = get_model(model_name, "text-generation")
-             theme_prompt = (
-                 "Extract the main theme of this text in 1-2 sentences. "
-                 "Respond as if explaining to a beginner. "
-                 "Text: {text}"
-             )
-
-             response = generator(
-                 theme_prompt.format(text=text[:2000]),
-                 max_length=200,
-                 num_return_sequences=1,
-                 do_sample=False
-             )
-
-             theme = response[0]["generated_text"].split(":")[-1].strip()
-             theme = re.sub(r"^(Le|La)\s+", "", theme)
-
              return {
                  "question": question,
-                 "answer": f"The document mainly discusses: {theme}",
                  "confidence": 0.95,
-                 "language": language,
-                 "processing_method": "theme_analysis",
-                 "success": True
              }

          # Standard QA processing
-         model_name = MODEL_MAPPING.get(language, {}).get("qa")
-         if not model_name:
-             model_name = MODEL_MAPPING["default"].get("qa")
-
-         qa_model = get_model(model_name, "question-answering")
-         result = qa_model(question=question, context=text)
-
-         if result["score"] < 0.1:
-             return {
-                 "question": question,
-                 "answer": "No clear answer found in the document" if language == "en" else "Aucune réponse claire trouvée dans le document",
-                 "confidence": result["score"],
-                 "language": language,
-                 "warning": "low_confidence",
-                 "success": True
-             }

          return {
              "question": question,
              "answer": result["answer"],
              "confidence": result["score"],
-             "language": language,
-             "success": True
          }

      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"QA processing failed: {str(e)}")
-         raise HTTPException(
-             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-             detail="Document analysis failed",
-             details={"error": str(e)}
-         )
-
- @app.post("/api/caption")
- async def caption_image(file: UploadFile = File(...)):
-     try:
-         file_ext, content = await validate_and_read_file(file)
-         if file_ext not in {"jpg", "jpeg", "png"}:
-             raise HTTPException(
-                 status_code=status.HTTP_400_BAD_REQUEST,
-                 detail="Only image files are supported for captioning"
-             )
-
-         image = Image.open(io.BytesIO(content)).convert("RGB")
-         captioner = get_model(MODEL_MAPPING["default"]["image_captioning"], "image-to-text")
-         caption = captioner(image)[0]['generated_text']
-
-         return {
-             "success": True,
-             "caption": caption,
-             "file_type": file_ext
-         }
-     except HTTPException:
-         raise
-     except Exception as e:
-         logger.error(f"Image captioning failed: {str(e)}")
-         raise HTTPException(
-             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-             detail="Image captioning failed",
-             details={"error": str(e)}
-         )
-
- @app.post("/translate")
- async def translate_text(
-     text: str = Form(...),
-     target_lang: str = Form(...),
-     src_lang: str = Form("eng_Latn")
- ):
-     try:
-         translator = get_model(MODEL_MAPPING["default"]["multilingual_translation"], "translation")
-         translated = translator(text, src_lang=src_lang, tgt_lang=target_lang)
-
-         return {
-             "success": True,
-             "translated_text": translated[0]["translation_text"],
-             "source_language": src_lang,
-             "target_language": target_lang
-         }
-     except Exception as e:
-         logger.error(f"Translation failed: {str(e)}")
-         raise HTTPException(
-             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-             detail="Text translation failed",
-             details={"error": str(e)}
-         )

- # Run the application
  if __name__ == "__main__":
-     port = int(os.environ.get("PORT", 7860))
-     uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
 
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
  from fastapi.middleware.cors import CORSMiddleware
+ from fastapi.responses import JSONResponse
+ from transformers import pipeline
+ from typing import Optional
  import io
  import fitz # PyMuPDF
  from PIL import Image
  from pptx import Presentation
  import pytesseract
  import logging
  import re

  # Configure logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ app = FastAPI()

+ # CORS Configuration
  app.add_middleware(
      CORSMiddleware,
      allow_origins=["*"],
      allow_methods=["*"],
      allow_headers=["*"],
  )

  # Constants
  MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
  SUPPORTED_FILE_TYPES = {
+     "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
  }

+ # Model caching
+ summarizer = None
+ qa_model = None
+ image_captioner = None
+
+ def get_summarizer():
+     global summarizer
+     if summarizer is None:
+         summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+     return summarizer
+
+ def get_qa_model():
+     global qa_model
+     if qa_model is None:
+         qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
+     return qa_model
+
+ def get_image_captioner():
+     global image_captioner
+     if image_captioner is None:
+         image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+     return image_captioner
+
+ async def process_uploaded_file(file: UploadFile):
+     if not file.filename:
+         raise HTTPException(400, "No file provided")
+
+     file_ext = file.filename.split('.')[-1].lower()
      if file_ext not in SUPPORTED_FILE_TYPES:
+         raise HTTPException(400, f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}")
+
      content = await file.read()
      if len(content) > MAX_FILE_SIZE:
+         raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
+
      return file_ext, content

  def extract_text(content: bytes, file_ext: str) -> str:
      try:
          if file_ext == "docx":
              doc = Document(io.BytesIO(content))

          elif file_ext == "pdf":
              pdf = fitz.open(stream=content, filetype="pdf")
+             text = []
+             for page in pdf:
+                 page_text = page.get_text("text")
+                 if page_text.strip():
+                     text.append(page_text)
+             return " ".join(text)

          elif file_ext in {"jpg", "jpeg", "png"}:
              image = Image.open(io.BytesIO(content))

      except Exception as e:
          logger.error(f"Text extraction failed: {str(e)}")
+         raise HTTPException(422, f"Failed to extract text from {file_ext} file")

  @app.post("/summarize")
  async def summarize_document(file: UploadFile = File(...)):
      try:
+         file_ext, content = await process_uploaded_file(file)
+         text = extract_text(content, file_ext)

          if not text.strip():
+             raise HTTPException(400, "No extractable text found")

+         # Clean and chunk text
+         text = re.sub(r'\s+', ' ', text).strip()
+         chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
+
+         # Summarize each chunk
+         summarizer = get_summarizer()
          summaries = []
          for chunk in chunks:
              summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
              summaries.append(summary)

+         return {"summary": " ".join(summaries)}
+
      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"Summarization failed: {str(e)}")
+         raise HTTPException(500, "Document summarization failed")

  @app.post("/qa")
  async def question_answering(
      file: UploadFile = File(...),
      question: str = Form(...),
+     language: str = Form("fr")
  ):
      try:
+         file_ext, content = await process_uploaded_file(file)
+         text = extract_text(content, file_ext)

+         if not text.strip():
+             raise HTTPException(400, "No extractable text found")

+         # Clean text
+         text = re.sub(r'\s+', ' ', text).strip()

+         # Handle theme questions
+         theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
+         if any(kw in question.lower() for kw in theme_keywords):
+             # Use summarization for theme detection
+             summarizer = get_summarizer()
+             theme = summarizer(text, max_length=100, min_length=30, do_sample=False)[0]["summary_text"]
              return {
                  "question": question,
+                 "answer": f"Le document traite principalement de : {theme}",
                  "confidence": 0.95,
+                 "language": language
              }

          # Standard QA processing
+         qa = get_qa_model()
+         result = qa(question=question, context=text)

          return {
              "question": question,
              "answer": result["answer"],
              "confidence": result["score"],
+             "language": language
          }

      except HTTPException:
          raise
      except Exception as e:
          logger.error(f"QA processing failed: {str(e)}")
+         raise HTTPException(500, "Document analysis failed")

  if __name__ == "__main__":
+     uvicorn.run(app, host="0.0.0.0", port=7860)
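
The committed main.py now exposes two multipart endpoints, /summarize and /qa, served by uvicorn on port 7860. A minimal client sketch with the requests library follows; the base URL and the "report.pdf" file name are placeholders for illustration, not part of the commit.

# Hypothetical client for the committed endpoints; adjust host, port and file name as needed.
import requests

BASE_URL = "http://localhost:7860"  # assumes the uvicorn defaults from the __main__ block

# Summarize a document: the endpoint expects a multipart "file" field.
with open("report.pdf", "rb") as f:  # placeholder document
    resp = requests.post(f"{BASE_URL}/summarize", files={"file": ("report.pdf", f, "application/pdf")})
print(resp.json()["summary"])

# Ask a question about the same document: "question" and "language" are Form fields.
with open("report.pdf", "rb") as f:
    resp = requests.post(
        f"{BASE_URL}/qa",
        files={"file": ("report.pdf", f, "application/pdf")},
        data={"question": "Quel est le sujet principal ?", "language": "fr"},
    )
answer = resp.json()
print(answer["answer"], answer["confidence"])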