Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

chenguittiMaroua committed on Apr 10

Commit

8ea794b

verified ·

1 Parent(s): 051e65c

Update main.py

Browse files

Files changed (1) hide show

main.py +537 -129

main.py CHANGED Viewed

@@ -1,28 +1,45 @@
-from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 from fastapi.responses import JSONResponse
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 from slowapi.errors import RateLimitExceeded
-from fastapi.middleware.cors import CORSMiddleware
-from starlette.requests import Request
-import pytesseract
-from PIL import Image
-import fitz  # PyMuPDF
-import docx
-import pptx
-import pandas as pd
-import io
-from transformers import pipeline
 import matplotlib.pyplot as plt
 import seaborn as sns
-import uuid
-import os
 app = FastAPI()
-# CORS (optional, for frontend access)
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
@@ -30,124 +47,515 @@ app.add_middleware(
     allow_headers=["*"],
 )
-# Rate Limiting
-limiter = Limiter(key_func=get_remote_address)
-app.state.limiter = limiter
-@app.exception_handler(RateLimitExceeded)
-async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
-    return JSONResponse(
-        status_code=429,
-        content={"error": "Rate limit exceeded. Please try again later."}
     )
-# Hugging Face Pipelines
-summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
-qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
-image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
-# Utility: Save image and return path
-def save_temp_image(upload: UploadFile):
-    image_path = f"temp/{uuid.uuid4().hex}_{upload.filename}"
-    with open(image_path, "wb") as f:
-        f.write(upload.file.read())
-    return image_path
-# --- File Parsing Utilities ---
-def extract_text_from_pdf(file_bytes: bytes) -> str:
-    doc = fitz.open(stream=file_bytes, filetype="pdf")
-    return "\n".join(page.get_text() for page in doc)
-def extract_text_from_docx(file_bytes: bytes) -> str:
-    doc = docx.Document(io.BytesIO(file_bytes))
-    return "\n".join(p.text for p in doc.paragraphs)
-def extract_text_from_pptx(file_bytes: bytes) -> str:
-    prs = pptx.Presentation(io.BytesIO(file_bytes))
-    text = ""
-    for slide in prs.slides:
-        for shape in slide.shapes:
-            if hasattr(shape, "text"):
-                text += shape.text + "\n"
-    return text
-def extract_text_from_image(file_bytes: bytes) -> str:
-    img = Image.open(io.BytesIO(file_bytes))
-    return pytesseract.image_to_string(img)
-def extract_data_from_excel(file_bytes: bytes) -> pd.DataFrame:
-    return pd.read_excel(io.BytesIO(file_bytes))
-# --- API Endpoints ---
-@app.post("/process/")
-@limiter.limit("10/minute")
-async def process_file(
     request: Request,
     file: UploadFile = File(...),
-    task: str = Form(...),
-    question: str = Form(None)
 ):
-    content_type = file.content_type
-    file_bytes = await file.read()
-    # --- Task: Summarization or QA ---
-    if task in ["summarization", "question_answering"]:
-        if content_type == "application/pdf":
-            text = extract_text_from_pdf(file_bytes)
-        elif content_type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
-            text = extract_text_from_docx(file_bytes)
-        elif content_type in ["application/vnd.openxmlformats-officedocument.presentationml.presentation"]:
-            text = extract_text_from_pptx(file_bytes)
-        elif content_type in ["image/png", "image/jpeg"]:
-            text = extract_text_from_image(file_bytes)
-        else:
-            raise HTTPException(status_code=400, detail="Unsupported file format for this task.")
-        if task == "summarization":
-            summary = summarizer(text[:3000])[0]["summary_text"]  # truncate long text
-            return {"summary": summary}
-        if task == "question_answering":
-            if not question:
-                raise HTTPException(status_code=400, detail="Question is required for QA.")
-            answer = qa_pipeline(question=question, context=text)
-            return {"answer": answer["answer"]}
-    # --- Task: Image Captioning ---
-    elif task == "captioning":
-        if content_type not in ["image/png", "image/jpeg"]:
-            raise HTTPException(status_code=400, detail="Only image files supported for captioning.")
-        image_path = save_temp_image(file)
-        caption = image_captioner(image_path)[0]["generated_text"]
-        os.remove(image_path)
-        return {"caption": caption}
-    # --- Task: Data Visualization ---
-    elif task == "visualization":
-        if content_type != "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
-            raise HTTPException(status_code=400, detail="Only Excel files supported for visualization.")
-        df = extract_data_from_excel(file_bytes)
-        if df.empty:
-            raise HTTPException(status_code=400, detail="No data found in Excel file.")
-        # Example visualization: correlation heatmap
-        numeric_df = df.select_dtypes(include="number")
-        if numeric_df.empty:
-            raise HTTPException(status_code=400, detail="No numeric data available for visualization.")
-        plt.figure(figsize=(10, 6))
-        sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
-        viz_path = f"temp/viz_{uuid.uuid4().hex}.png"
-        plt.savefig(viz_path)
-        plt.close()
-        with open(viz_path, "rb") as img_file:
-            img_bytes = img_file.read()
-        os.remove(viz_path)
-        return JSONResponse(content={"image_bytes": list(img_bytes)})
-    else:
-        raise HTTPException(status_code=400, detail="Unsupported task.")

+from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
+from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
+from transformers import pipeline
+from typing import Tuple, Optional
+import io
+import fitz  # PyMuPDF
+from PIL import Image
+import pandas as pd
+import uvicorn
+from docx import Document
+from pptx import Presentation
+import pytesseract
+import logging
+import re
 from slowapi import Limiter
 from slowapi.util import get_remote_address
 from slowapi.errors import RateLimitExceeded
+from slowapi.middleware import SlowAPIMiddleware
 import matplotlib.pyplot as plt
 import seaborn as sns
+import tempfile
+import base64
+from io import BytesIO
+from pydantic import BaseModel
+import traceback
+import ast
+# Rate limiter keyed on the value returned by get_remote_address; per-route
+# limits are declared with @limiter.limit(...) on the individual endpoints.
+limiter = Limiter(key_func=get_remote_address)
+# Log at INFO and above; `logger` is the module-level logger used by the handlers.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
 app = FastAPI()
+# Expose the limiter on app.state and install the SlowAPI middleware.
+app.state.limiter = limiter
+app.add_middleware(SlowAPIMiddleware)
+# CORS Configuration
 app.add_middleware(
     CORSMiddleware,
     allow_origins=["*"],
     allow_headers=["*"],
 )
+# Constants
+# Upload size cap in bytes; enforced in process_uploaded_file.
+MAX_FILE_SIZE = 10 * 1024 * 1024  # 10MB
+# Lower-case filename extensions accepted by process_uploaded_file.
+# NOTE(review): several endpoints also test for "xls", which is not listed
+# here, so an .xls upload is rejected before those checks run - confirm intent.
+SUPPORTED_FILE_TYPES = {
+    "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
+}
+# Model caching: each pipeline starts as None and is created on first use by
+# the matching get_*() accessor.
+summarizer = None
+qa_model = None
+image_captioner = None
+def get_summarizer():
+    global summarizer
+    if summarizer is None:
+        summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
+    return summarizer
+def get_qa_model():
+    global qa_model
+    if qa_model is None:
+        qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
+    return qa_model
+def get_image_captioner():
+    global image_captioner
+    if image_captioner is None:
+        image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
+    return image_captioner
+async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
+    """Validate and process uploaded file with special handling for each type"""
+    if not file.filename:
+        raise HTTPException(400, "No filename provided")
+    file_ext = file.filename.split('.')[-1].lower()
+    if file_ext not in SUPPORTED_FILE_TYPES:
+        raise HTTPException(400, f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}")
+    content = await file.read()
+    if len(content) > MAX_FILE_SIZE:
+        raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
+    # Special validation for PDFs
+    if file_ext == "pdf":
+        try:
+            with fitz.open(stream=content, filetype="pdf") as doc:
+                if doc.is_encrypted:
+                    if not doc.authenticate(""):
+                        raise ValueError("Encrypted PDF - cannot extract text")
+                if len(doc) > 50:
+                    raise ValueError("PDF too large (max 50 pages)")
+        except Exception as e:
+            logger.error(f"PDF validation failed: {str(e)}")
+            raise HTTPException(422, detail=f"Invalid PDF file: {str(e)}")
+    await file.seek(0)  # Reset file pointer for processing
+    return file_ext, content
+def extract_text(content: bytes, file_ext: str) -> str:
+    """Extract text from various file formats with enhanced support"""
+    try:
+        if file_ext == "docx":
+            doc = Document(io.BytesIO(content))
+            return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
+        elif file_ext in {"xlsx", "xls"}:
+            df = pd.read_excel(io.BytesIO(content), sheet_name=None)
+            all_text = []
+            for sheet_name, sheet_data in df.items():
+                sheet_text = []
+                for column in sheet_data.columns:
+                    sheet_text.extend(sheet_data[column].dropna().astype(str).tolist())
+                all_text.append(f"Sheet: {sheet_name}\n" + "\n".join(sheet_text))
+            return "\n\n".join(all_text)
+        elif file_ext == "pptx":
+            ppt = Presentation(io.BytesIO(content))
+            text = []
+            for slide in ppt.slides:
+                for shape in slide.shapes:
+                    if hasattr(shape, "text") and shape.text.strip():
+                        text.append(shape.text)
+            return "\n".join(text)
+        elif file_ext == "pdf":
+            pdf = fitz.open(stream=content, filetype="pdf")
+            return "\n".join(page.get_text("text") for page in pdf)
+        elif file_ext in {"jpg", "jpeg", "png"}:
+            # First try OCR
+            try:
+                image = Image.open(io.BytesIO(content))
+                text = pytesseract.image_to_string(image, config='--psm 6')
+                if text.strip():
+                    return text
+                # If OCR fails, try image captioning
+                captioner = get_image_captioner()
+                result = captioner(image)
+                return result[0]['generated_text']
+            except Exception as img_e:
+                logger.error(f"Image processing failed: {str(img_e)}")
+                raise ValueError("Could not extract text or caption from image")
+    except Exception as e:
+        logger.error(f"Text extraction failed for {file_ext}: {str(e)}")
+        raise HTTPException(422, f"Failed to extract text from {file_ext} file")
+# Visualization Models
+class VisualizationRequest(BaseModel):
+    """Parameters describing one chart to generate from a DataFrame."""
+    # Chart kind; generate_visualization_code handles "line", "bar",
+    # "scatter", "histogram", "boxplot" and "heatmap".
+    chart_type: str
+    # Column names used for the x axis, y axis and colour grouping.
+    x_column: Optional[str] = None
+    y_column: Optional[str] = None
+    hue_column: Optional[str] = None
+    # Optional chart title and axis labels; omitted from the plot when unset.
+    title: Optional[str] = None
+    x_label: Optional[str] = None
+    y_label: Optional[str] = None
+    # Name passed to plt.style.use() in the generated code.
+    style: str = "seaborn"
+    # Row filters keyed by column: {"min": .., "max": ..} for a range,
+    # {"values": [..]} for membership, or a bare value for equality.
+    filters: Optional[dict] = None
+class NaturalLanguageRequest(BaseModel):
+    """A free-text chart request plus the plot style to use."""
+    # Free-text description of the desired chart.
+    prompt: str
+    # Plot style name; same default as VisualizationRequest.style.
+    style: str = "seaborn"
+def generate_visualization_code(df: pd.DataFrame, request: VisualizationRequest) -> str:
+    """Generate Python code for visualization based on request parameters"""
+    code_lines = [
+        "import matplotlib.pyplot as plt",
+        "import seaborn as sns",
+        "import pandas as pd",
+        "",
+        "# Data preparation",
+        f"df = pd.DataFrame({df.to_dict(orient='list')})",
+    ]
+    # Apply filters if specified
+    if request.filters:
+        filter_conditions = []
+        for column, condition in request.filters.items():
+            if isinstance(condition, dict):
+                if 'min' in condition and 'max' in condition:
+                    filter_conditions.append(f"(df['{column}'] >= {condition['min']}) & (df['{column}'] <= {condition['max']})")
+                elif 'values' in condition:
+                    values = ', '.join([f"'{v}'" if isinstance(v, str) else str(v) for v in condition['values']])
+                    filter_conditions.append(f"df['{column}'].isin([{values}])")
+            else:
+                filter_conditions.append(f"df['{column}'] == {repr(condition)}")
+        if filter_conditions:
+            code_lines.extend([
+                "",
+                "# Apply filters",
+                f"df = df[{' & '.join(filter_conditions)}]"
+            ])
+    code_lines.extend([
+        "",
+        "# Visualization",
+        f"plt.style.use('{request.style}')",
+        f"plt.figure(figsize=(10, 6))"
+    ])
+    # Chart type specific code
+    if request.chart_type == "line":
+        if request.hue_column:
+            code_lines.append(f"sns.lineplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
+        else:
+            code_lines.append(f"plt.plot(df['{request.x_column}'], df['{request.y_column}'])")
+    elif request.chart_type == "bar":
+        if request.hue_column:
+            code_lines.append(f"sns.barplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
+        else:
+            code_lines.append(f"plt.bar(df['{request.x_column}'], df['{request.y_column}'])")
+    elif request.chart_type == "scatter":
+        if request.hue_column:
+            code_lines.append(f"sns.scatterplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
+        else:
+            code_lines.append(f"plt.scatter(df['{request.x_column}'], df['{request.y_column}'])")
+    elif request.chart_type == "histogram":
+        code_lines.append(f"plt.hist(df['{request.x_column}'], bins=20)")
+    elif request.chart_type == "boxplot":
+        if request.hue_column:
+            code_lines.append(f"sns.boxplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
+        else:
+            code_lines.append(f"sns.boxplot(data=df, x='{request.x_column}', y='{request.y_column}')")
+    elif request.chart_type == "heatmap":
+        code_lines.append(f"corr = df.corr()")
+        code_lines.append(f"sns.heatmap(corr, annot=True, cmap='coolwarm')")
+    else:
+        raise ValueError(f"Unsupported chart type: {request.chart_type}")
+    # Add labels and title
+    if request.title:
+        code_lines.append(f"plt.title('{request.title}')")
+    if request.x_label:
+        code_lines.append(f"plt.xlabel('{request.x_label}')")
+    if request.y_label:
+        code_lines.append(f"plt.ylabel('{request.y_label}')")
+    code_lines.extend([
+        "plt.tight_layout()",
+        "plt.show()"
+    ])
+    return "\n".join(code_lines)
+def interpret_natural_language(prompt: str, df_columns: list) -> VisualizationRequest:
+    """Convert natural language prompt to visualization parameters"""
+    # Simple keyword-based interpretation (could be enhanced with NLP)
+    prompt = prompt.lower()
+    # Determine chart type
+    chart_type = "bar"
+    if "line" in prompt:
+        chart_type = "line"
+    elif "scatter" in prompt:
+        chart_type = "scatter"
+    elif "histogram" in prompt:
+        chart_type = "histogram"
+    elif "box" in prompt:
+        chart_type = "boxplot"
+    elif "heatmap" in prompt or "correlation" in prompt:
+        chart_type = "heatmap"
+    # Try to detect columns
+    x_col = None
+    y_col = None
+    hue_col = None
+    for col in df_columns:
+        if col.lower() in prompt:
+            if not x_col:
+                x_col = col
+            elif not y_col:
+                y_col = col
+            else:
+                hue_col = col
+    # Default to first columns if not detected
+    if not x_col and len(df_columns) > 0:
+        x_col = df_columns[0]
+    if not y_col and len(df_columns) > 1:
+        y_col = df_columns[1]
+    return VisualizationRequest(
+        chart_type=chart_type,
+        x_column=x_col,
+        y_column=y_col,
+        hue_column=hue_col,
+        title="Generated from: " + prompt[:50] + ("..." if len(prompt) > 50 else ""),
+        style="seaborn"
     )
+@app.post("/summarize")
+@limiter.limit("5/minute")
+async def summarize_document(request: Request, file: UploadFile = File(...)):
+    try:
+        file_ext, content = await process_uploaded_file(file)
+        text = extract_text(content, file_ext)
+        if not text.strip():
+            raise HTTPException(400, "No extractable text found")
+        # Clean and chunk text
+        text = re.sub(r'\s+', ' ', text).strip()
+        chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
+        # Summarize each chunk
+        summarizer = get_summarizer()
+        summaries = []
+        for chunk in chunks:
+            summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
+            summaries.append(summary)
+        return {"summary": " ".join(summaries)}
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Summarization failed: {str(e)}")
+        raise HTTPException(500, "Document summarization failed")
+@app.post("/qa")
+@limiter.limit("5/minute")
+async def question_answering(
     request: Request,
     file: UploadFile = File(...),
+    question: str = Form(...),
+    language: str = Form("fr")
 ):
+    try:
+        file_ext, content = await process_uploaded_file(file)
+        text = extract_text(content, file_ext)
+        if not text.strip():
+            raise HTTPException(400, "No extractable text found")
+        # Clean and truncate text
+        text = re.sub(r'\s+', ' ', text).strip()[:5000]
+        # Theme detection
+        theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
+        if any(kw in question.lower() for kw in theme_keywords):
+            try:
+                summarizer = get_summarizer()
+                summary_output = summarizer(
+                    text,
+                    max_length=min(100, len(text)//4),
+                    min_length=30,
+                    do_sample=False,
+                    truncation=True
+                )
+                theme = summary_output[0].get("summary_text", text[:200] + "...")
+                return {
+                    "question": question,
+                    "answer": f"Le document traite principalement de : {theme}",
+                    "confidence": 0.95,
+                    "language": language
+                }
+            except Exception:
+                theme = text[:200] + ("..." if len(text) > 200 else "")
+                return {
+                    "question": question,
+                    "answer": f"D'après le document : {theme}",
+                    "confidence": 0.7,
+                    "language": language,
+                    "warning": "theme_summary_fallback"
+                }
+        # Standard QA
+        qa = get_qa_model()
+        result = qa(question=question, context=text[:3000])
+        return {
+            "question": question,
+            "answer": result["answer"],
+            "confidence": result["score"],
+            "language": language
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"QA processing failed: {str(e)}")
+        raise HTTPException(500, detail=f"Analysis failed: {str(e)}")
+@app.post("/visualize/code")
+@limiter.limit("5/minute")
+async def visualize_with_code(
+    request: Request,
+    file: UploadFile = File(...),
+    chart_type: str = Form(...),
+    x_column: Optional[str] = Form(None),
+    y_column: Optional[str] = Form(None),
+    hue_column: Optional[str] = Form(None),
+    title: Optional[str] = Form(None),
+    x_label: Optional[str] = Form(None),
+    y_label: Optional[str] = Form(None),
+    style: str = Form("seaborn"),
+    filters: Optional[str] = Form(None)
+):
+    try:
+        # Validate file
+        file_ext, content = await process_uploaded_file(file)
+        if file_ext not in {"xlsx", "xls"}:
+            raise HTTPException(400, "Only Excel files are supported for visualization")
+        # Read Excel file
+        df = pd.read_excel(io.BytesIO(content))
+        # Parse filters if provided
+        filter_dict = {}
+        if filters:
+            try:
+                filter_dict = ast.literal_eval(filters)
+                if not isinstance(filter_dict, dict):
+                    filter_dict = {}
+            except:
+                filter_dict = {}
+        # Create visualization request
+        vis_request = VisualizationRequest(
+            chart_type=chart_type,
+            x_column=x_column,
+            y_column=y_column,
+            hue_column=hue_column,
+            title=title,
+            x_label=x_label,
+            y_label=y_label,
+            style=style,
+            filters=filter_dict
+        )
+        # Generate visualization code
+        visualization_code = generate_visualization_code(df, vis_request)
+        # Execute the code to generate the plot
+        plt.figure()
+        local_vars = {}
+        exec(visualization_code, globals(), local_vars)
+        # Save the plot to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
+            plt.savefig(tmpfile.name, format='png', dpi=300)
+            plt.close()
+            # Read the image back as bytes
+            with open(tmpfile.name, "rb") as f:
+                image_bytes = f.read()
+        # Encode image as base64
+        image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+        return {
+            "status": "success",
+            "image": f"data:image/png;base64,{image_base64}",
+            "code": visualization_code,
+            "data_preview": df.head().to_dict(orient='records')
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Visualization failed: {str(e)}\n{traceback.format_exc()}")
+        raise HTTPException(500, detail=f"Visualization failed: {str(e)}")
+@app.post("/visualize/natural")
+@limiter.limit("5/minute")
+async def visualize_with_natural_language(
+    request: Request,
+    file: UploadFile = File(...),
+    prompt: str = Form(...),
+    style: str = Form("seaborn")
+):
+    try:
+        # Validate file
+        file_ext, content = await process_uploaded_file(file)
+        if file_ext not in {"xlsx", "xls"}:
+            raise HTTPException(400, "Only Excel files are supported for visualization")
+        # Read Excel file
+        df = pd.read_excel(io.BytesIO(content))
+        # Convert natural language to visualization parameters
+        nl_request = NaturalLanguageRequest(prompt=prompt, style=style)
+        vis_request = interpret_natural_language(nl_request.prompt, df.columns.tolist())
+        # Generate visualization code
+        visualization_code = generate_visualization_code(df, vis_request)
+        # Execute the code to generate the plot
+        plt.figure()
+        local_vars = {}
+        exec(visualization_code, globals(), local_vars)
+        # Save the plot to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
+            plt.savefig(tmpfile.name, format='png', dpi=300)
+            plt.close()
+            # Read the image back as bytes
+            with open(tmpfile.name, "rb") as f:
+                image_bytes = f.read()
+        # Encode image as base64
+        image_base64 = base64.b64encode(image_bytes).decode('utf-8')
+        return {
+            "status": "success",
+            "image": f"data:image/png;base64,{image_base64}",
+            "code": visualization_code,
+            "interpreted_parameters": vis_request.dict(),
+            "data_preview": df.head().to_dict(orient='records')
+        }
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.error(f"Natural language visualization failed: {str(e)}\n{traceback.format_exc()}")
+        raise HTTPException(500, detail=f"Visualization failed: {str(e)}")
+@app.post("/get_columns")
+@limiter.limit("10/minute")
+async def get_excel_columns(
+    request: Request,
+    file: UploadFile = File(...)
+):
+    try:
+        file_ext, content = await process_uploaded_file(file)
+        if file_ext not in {"xlsx", "xls"}:
+            raise HTTPException(400, "Only Excel files are supported")
+        df = pd.read_excel(io.BytesIO(content))
+        return {
+            "columns": list(df.columns),
+            "sample_data": df.head().to_dict(orient='records'),
+            "statistics": df.describe().to_dict() if len(df.select_dtypes(include=['number']).columns) > 0 else None
+        }
+    except Exception as e:
+        logger.error(f"Column extraction failed: {str(e)}")
+        raise HTTPException(500, detail="Failed to extract columns from Excel file")
+@app.exception_handler(RateLimitExceeded)
+async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
+    return JSONResponse(
+        status_code=429,
+        content={"detail": "Too many requests. Please try again later."}
+    )
+# When run as a script, serve the app with uvicorn on all interfaces, port 7860.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)