chenguittiMaroua committed on
Commit
051e65c
·
verified ·
1 Parent(s): fc8ee72

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +129 -537
main.py CHANGED
@@ -1,45 +1,28 @@
1
- from fastapi import FastAPI, UploadFile, File, Form, HTTPException, Request
2
- from fastapi.middleware.cors import CORSMiddleware
3
  from fastapi.responses import JSONResponse
4
- from transformers import pipeline
5
- from typing import Tuple, Optional
6
- import io
7
- import fitz # PyMuPDF
8
- from PIL import Image
9
- import pandas as pd
10
- import uvicorn
11
- from docx import Document
12
- from pptx import Presentation
13
- import pytesseract
14
- import logging
15
- import re
16
  from slowapi import Limiter
17
  from slowapi.util import get_remote_address
18
  from slowapi.errors import RateLimitExceeded
19
- from slowapi.middleware import SlowAPIMiddleware
20
- import matplotlib.pyplot as plt
21
- import seaborn as sns
22
- import tempfile
23
- import base64
24
- from io import BytesIO
25
- from pydantic import BaseModel
26
- import traceback
27
- import ast
28
 
29
- # Initialize rate limiter
30
- limiter = Limiter(key_func=get_remote_address)
 
 
 
 
 
31
 
32
- # Configure logging
33
- logging.basicConfig(level=logging.INFO)
34
- logger = logging.getLogger(__name__)
 
 
35
 
36
  app = FastAPI()
37
 
38
- # Apply rate limiting middleware
39
- app.state.limiter = limiter
40
- app.add_middleware(SlowAPIMiddleware)
41
-
42
- # CORS Configuration
43
  app.add_middleware(
44
  CORSMiddleware,
45
  allow_origins=["*"],
@@ -47,515 +30,124 @@ app.add_middleware(
47
  allow_headers=["*"],
48
  )
49
 
50
- # Constants
51
- MAX_FILE_SIZE = 10 * 1024 * 1024 # 10MB
52
- SUPPORTED_FILE_TYPES = {
53
- "docx", "xlsx", "pptx", "pdf", "jpg", "jpeg", "png"
54
- }
55
-
56
- # Model caching
57
- summarizer = None
58
- qa_model = None
59
- image_captioner = None
60
-
61
- def get_summarizer():
62
- global summarizer
63
- if summarizer is None:
64
- summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
65
- return summarizer
66
-
67
- def get_qa_model():
68
- global qa_model
69
- if qa_model is None:
70
- qa_model = pipeline("question-answering", model="deepset/roberta-base-squad2")
71
- return qa_model
72
-
73
- def get_image_captioner():
74
- global image_captioner
75
- if image_captioner is None:
76
- image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
77
- return image_captioner
78
-
79
- async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
80
- """Validate and process uploaded file with special handling for each type"""
81
- if not file.filename:
82
- raise HTTPException(400, "No filename provided")
83
-
84
- file_ext = file.filename.split('.')[-1].lower()
85
- if file_ext not in SUPPORTED_FILE_TYPES:
86
- raise HTTPException(400, f"Unsupported file type. Supported: {', '.join(SUPPORTED_FILE_TYPES)}")
87
-
88
- content = await file.read()
89
- if len(content) > MAX_FILE_SIZE:
90
- raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
91
-
92
- # Special validation for PDFs
93
- if file_ext == "pdf":
94
- try:
95
- with fitz.open(stream=content, filetype="pdf") as doc:
96
- if doc.is_encrypted:
97
- if not doc.authenticate(""):
98
- raise ValueError("Encrypted PDF - cannot extract text")
99
- if len(doc) > 50:
100
- raise ValueError("PDF too large (max 50 pages)")
101
- except Exception as e:
102
- logger.error(f"PDF validation failed: {str(e)}")
103
- raise HTTPException(422, detail=f"Invalid PDF file: {str(e)}")
104
-
105
- await file.seek(0) # Reset file pointer for processing
106
- return file_ext, content
107
-
108
- def extract_text(content: bytes, file_ext: str) -> str:
109
- """Extract text from various file formats with enhanced support"""
110
- try:
111
- if file_ext == "docx":
112
- doc = Document(io.BytesIO(content))
113
- return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
114
-
115
- elif file_ext in {"xlsx", "xls"}:
116
- df = pd.read_excel(io.BytesIO(content), sheet_name=None)
117
- all_text = []
118
- for sheet_name, sheet_data in df.items():
119
- sheet_text = []
120
- for column in sheet_data.columns:
121
- sheet_text.extend(sheet_data[column].dropna().astype(str).tolist())
122
- all_text.append(f"Sheet: {sheet_name}\n" + "\n".join(sheet_text))
123
- return "\n\n".join(all_text)
124
-
125
- elif file_ext == "pptx":
126
- ppt = Presentation(io.BytesIO(content))
127
- text = []
128
- for slide in ppt.slides:
129
- for shape in slide.shapes:
130
- if hasattr(shape, "text") and shape.text.strip():
131
- text.append(shape.text)
132
- return "\n".join(text)
133
-
134
- elif file_ext == "pdf":
135
- pdf = fitz.open(stream=content, filetype="pdf")
136
- return "\n".join(page.get_text("text") for page in pdf)
137
-
138
- elif file_ext in {"jpg", "jpeg", "png"}:
139
- # First try OCR
140
- try:
141
- image = Image.open(io.BytesIO(content))
142
- text = pytesseract.image_to_string(image, config='--psm 6')
143
- if text.strip():
144
- return text
145
-
146
- # If OCR fails, try image captioning
147
- captioner = get_image_captioner()
148
- result = captioner(image)
149
- return result[0]['generated_text']
150
- except Exception as img_e:
151
- logger.error(f"Image processing failed: {str(img_e)}")
152
- raise ValueError("Could not extract text or caption from image")
153
-
154
- except Exception as e:
155
- logger.error(f"Text extraction failed for {file_ext}: {str(e)}")
156
- raise HTTPException(422, f"Failed to extract text from {file_ext} file")
157
-
158
- # Visualization Models
159
- class VisualizationRequest(BaseModel):
160
- chart_type: str
161
- x_column: Optional[str] = None
162
- y_column: Optional[str] = None
163
- hue_column: Optional[str] = None
164
- title: Optional[str] = None
165
- x_label: Optional[str] = None
166
- y_label: Optional[str] = None
167
- style: str = "seaborn"
168
- filters: Optional[dict] = None
169
-
170
- class NaturalLanguageRequest(BaseModel):
171
- prompt: str
172
- style: str = "seaborn"
173
-
174
- def generate_visualization_code(df: pd.DataFrame, request: VisualizationRequest) -> str:
175
- """Generate Python code for visualization based on request parameters"""
176
- code_lines = [
177
- "import matplotlib.pyplot as plt",
178
- "import seaborn as sns",
179
- "import pandas as pd",
180
- "",
181
- "# Data preparation",
182
- f"df = pd.DataFrame({df.to_dict(orient='list')})",
183
- ]
184
-
185
- # Apply filters if specified
186
- if request.filters:
187
- filter_conditions = []
188
- for column, condition in request.filters.items():
189
- if isinstance(condition, dict):
190
- if 'min' in condition and 'max' in condition:
191
- filter_conditions.append(f"(df['{column}'] >= {condition['min']}) & (df['{column}'] <= {condition['max']})")
192
- elif 'values' in condition:
193
- values = ', '.join([f"'{v}'" if isinstance(v, str) else str(v) for v in condition['values']])
194
- filter_conditions.append(f"df['{column}'].isin([{values}])")
195
- else:
196
- filter_conditions.append(f"df['{column}'] == {repr(condition)}")
197
-
198
- if filter_conditions:
199
- code_lines.extend([
200
- "",
201
- "# Apply filters",
202
- f"df = df[{' & '.join(filter_conditions)}]"
203
- ])
204
-
205
- code_lines.extend([
206
- "",
207
- "# Visualization",
208
- f"plt.style.use('{request.style}')",
209
- f"plt.figure(figsize=(10, 6))"
210
- ])
211
-
212
- # Chart type specific code
213
- if request.chart_type == "line":
214
- if request.hue_column:
215
- code_lines.append(f"sns.lineplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
216
- else:
217
- code_lines.append(f"plt.plot(df['{request.x_column}'], df['{request.y_column}'])")
218
- elif request.chart_type == "bar":
219
- if request.hue_column:
220
- code_lines.append(f"sns.barplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
221
- else:
222
- code_lines.append(f"plt.bar(df['{request.x_column}'], df['{request.y_column}'])")
223
- elif request.chart_type == "scatter":
224
- if request.hue_column:
225
- code_lines.append(f"sns.scatterplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
226
- else:
227
- code_lines.append(f"plt.scatter(df['{request.x_column}'], df['{request.y_column}'])")
228
- elif request.chart_type == "histogram":
229
- code_lines.append(f"plt.hist(df['{request.x_column}'], bins=20)")
230
- elif request.chart_type == "boxplot":
231
- if request.hue_column:
232
- code_lines.append(f"sns.boxplot(data=df, x='{request.x_column}', y='{request.y_column}', hue='{request.hue_column}')")
233
- else:
234
- code_lines.append(f"sns.boxplot(data=df, x='{request.x_column}', y='{request.y_column}')")
235
- elif request.chart_type == "heatmap":
236
- code_lines.append(f"corr = df.corr()")
237
- code_lines.append(f"sns.heatmap(corr, annot=True, cmap='coolwarm')")
238
- else:
239
- raise ValueError(f"Unsupported chart type: {request.chart_type}")
240
-
241
- # Add labels and title
242
- if request.title:
243
- code_lines.append(f"plt.title('{request.title}')")
244
- if request.x_label:
245
- code_lines.append(f"plt.xlabel('{request.x_label}')")
246
- if request.y_label:
247
- code_lines.append(f"plt.ylabel('{request.y_label}')")
248
-
249
- code_lines.extend([
250
- "plt.tight_layout()",
251
- "plt.show()"
252
- ])
253
-
254
- return "\n".join(code_lines)
255
 
256
- def interpret_natural_language(prompt: str, df_columns: list) -> VisualizationRequest:
257
- """Convert natural language prompt to visualization parameters"""
258
- # Simple keyword-based interpretation (could be enhanced with NLP)
259
- prompt = prompt.lower()
260
-
261
- # Determine chart type
262
- chart_type = "bar"
263
- if "line" in prompt:
264
- chart_type = "line"
265
- elif "scatter" in prompt:
266
- chart_type = "scatter"
267
- elif "histogram" in prompt:
268
- chart_type = "histogram"
269
- elif "box" in prompt:
270
- chart_type = "boxplot"
271
- elif "heatmap" in prompt or "correlation" in prompt:
272
- chart_type = "heatmap"
273
-
274
- # Try to detect columns
275
- x_col = None
276
- y_col = None
277
- hue_col = None
278
-
279
- for col in df_columns:
280
- if col.lower() in prompt:
281
- if not x_col:
282
- x_col = col
283
- elif not y_col:
284
- y_col = col
285
- else:
286
- hue_col = col
287
-
288
- # Default to first columns if not detected
289
- if not x_col and len(df_columns) > 0:
290
- x_col = df_columns[0]
291
- if not y_col and len(df_columns) > 1:
292
- y_col = df_columns[1]
293
-
294
- return VisualizationRequest(
295
- chart_type=chart_type,
296
- x_column=x_col,
297
- y_column=y_col,
298
- hue_column=hue_col,
299
- title="Generated from: " + prompt[:50] + ("..." if len(prompt) > 50 else ""),
300
- style="seaborn"
301
  )
302
 
303
- @app.post("/summarize")
304
- @limiter.limit("5/minute")
305
- async def summarize_document(request: Request, file: UploadFile = File(...)):
306
- try:
307
- file_ext, content = await process_uploaded_file(file)
308
- text = extract_text(content, file_ext)
309
-
310
- if not text.strip():
311
- raise HTTPException(400, "No extractable text found")
312
-
313
- # Clean and chunk text
314
- text = re.sub(r'\s+', ' ', text).strip()
315
- chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
316
-
317
- # Summarize each chunk
318
- summarizer = get_summarizer()
319
- summaries = []
320
- for chunk in chunks:
321
- summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
322
- summaries.append(summary)
323
-
324
- return {"summary": " ".join(summaries)}
325
-
326
- except HTTPException:
327
- raise
328
- except Exception as e:
329
- logger.error(f"Summarization failed: {str(e)}")
330
- raise HTTPException(500, "Document summarization failed")
331
-
332
- @app.post("/qa")
333
- @limiter.limit("5/minute")
334
- async def question_answering(
335
- request: Request,
336
- file: UploadFile = File(...),
337
- question: str = Form(...),
338
- language: str = Form("fr")
339
- ):
340
- try:
341
- file_ext, content = await process_uploaded_file(file)
342
- text = extract_text(content, file_ext)
343
-
344
- if not text.strip():
345
- raise HTTPException(400, "No extractable text found")
346
-
347
- # Clean and truncate text
348
- text = re.sub(r'\s+', ' ', text).strip()[:5000]
349
-
350
- # Theme detection
351
- theme_keywords = ["thème", "sujet principal", "quoi le sujet", "theme", "main topic"]
352
- if any(kw in question.lower() for kw in theme_keywords):
353
- try:
354
- summarizer = get_summarizer()
355
- summary_output = summarizer(
356
- text,
357
- max_length=min(100, len(text)//4),
358
- min_length=30,
359
- do_sample=False,
360
- truncation=True
361
- )
362
-
363
- theme = summary_output[0].get("summary_text", text[:200] + "...")
364
- return {
365
- "question": question,
366
- "answer": f"Le document traite principalement de : {theme}",
367
- "confidence": 0.95,
368
- "language": language
369
- }
370
- except Exception:
371
- theme = text[:200] + ("..." if len(text) > 200 else "")
372
- return {
373
- "question": question,
374
- "answer": f"D'après le document : {theme}",
375
- "confidence": 0.7,
376
- "language": language,
377
- "warning": "theme_summary_fallback"
378
- }
379
-
380
- # Standard QA
381
- qa = get_qa_model()
382
- result = qa(question=question, context=text[:3000])
383
-
384
- return {
385
- "question": question,
386
- "answer": result["answer"],
387
- "confidence": result["score"],
388
- "language": language
389
- }
390
-
391
- except HTTPException:
392
- raise
393
- except Exception as e:
394
- logger.error(f"QA processing failed: {str(e)}")
395
- raise HTTPException(500, detail=f"Analysis failed: {str(e)}")
396
-
397
- @app.post("/visualize/code")
398
- @limiter.limit("5/minute")
399
- async def visualize_with_code(
400
- request: Request,
401
- file: UploadFile = File(...),
402
- chart_type: str = Form(...),
403
- x_column: Optional[str] = Form(None),
404
- y_column: Optional[str] = Form(None),
405
- hue_column: Optional[str] = Form(None),
406
- title: Optional[str] = Form(None),
407
- x_label: Optional[str] = Form(None),
408
- y_label: Optional[str] = Form(None),
409
- style: str = Form("seaborn"),
410
- filters: Optional[str] = Form(None)
411
- ):
412
- try:
413
- # Validate file
414
- file_ext, content = await process_uploaded_file(file)
415
- if file_ext not in {"xlsx", "xls"}:
416
- raise HTTPException(400, "Only Excel files are supported for visualization")
417
-
418
- # Read Excel file
419
- df = pd.read_excel(io.BytesIO(content))
420
-
421
- # Parse filters if provided
422
- filter_dict = {}
423
- if filters:
424
- try:
425
- filter_dict = ast.literal_eval(filters)
426
- if not isinstance(filter_dict, dict):
427
- filter_dict = {}
428
- except:
429
- filter_dict = {}
430
-
431
- # Create visualization request
432
- vis_request = VisualizationRequest(
433
- chart_type=chart_type,
434
- x_column=x_column,
435
- y_column=y_column,
436
- hue_column=hue_column,
437
- title=title,
438
- x_label=x_label,
439
- y_label=y_label,
440
- style=style,
441
- filters=filter_dict
442
- )
443
-
444
- # Generate visualization code
445
- visualization_code = generate_visualization_code(df, vis_request)
446
-
447
- # Execute the code to generate the plot
448
- plt.figure()
449
- local_vars = {}
450
- exec(visualization_code, globals(), local_vars)
451
-
452
- # Save the plot to a temporary file
453
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
454
- plt.savefig(tmpfile.name, format='png', dpi=300)
455
- plt.close()
456
-
457
- # Read the image back as bytes
458
- with open(tmpfile.name, "rb") as f:
459
- image_bytes = f.read()
460
-
461
- # Encode image as base64
462
- image_base64 = base64.b64encode(image_bytes).decode('utf-8')
463
-
464
- return {
465
- "status": "success",
466
- "image": f"data:image/png;base64,{image_base64}",
467
- "code": visualization_code,
468
- "data_preview": df.head().to_dict(orient='records')
469
- }
470
-
471
- except HTTPException:
472
- raise
473
- except Exception as e:
474
- logger.error(f"Visualization failed: {str(e)}\n{traceback.format_exc()}")
475
- raise HTTPException(500, detail=f"Visualization failed: {str(e)}")
476
-
477
- @app.post("/visualize/natural")
478
- @limiter.limit("5/minute")
479
- async def visualize_with_natural_language(
480
- request: Request,
481
- file: UploadFile = File(...),
482
- prompt: str = Form(...),
483
- style: str = Form("seaborn")
484
- ):
485
- try:
486
- # Validate file
487
- file_ext, content = await process_uploaded_file(file)
488
- if file_ext not in {"xlsx", "xls"}:
489
- raise HTTPException(400, "Only Excel files are supported for visualization")
490
-
491
- # Read Excel file
492
- df = pd.read_excel(io.BytesIO(content))
493
-
494
- # Convert natural language to visualization parameters
495
- nl_request = NaturalLanguageRequest(prompt=prompt, style=style)
496
- vis_request = interpret_natural_language(nl_request.prompt, df.columns.tolist())
497
-
498
- # Generate visualization code
499
- visualization_code = generate_visualization_code(df, vis_request)
500
-
501
- # Execute the code to generate the plot
502
- plt.figure()
503
- local_vars = {}
504
- exec(visualization_code, globals(), local_vars)
505
-
506
- # Save the plot to a temporary file
507
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmpfile:
508
- plt.savefig(tmpfile.name, format='png', dpi=300)
509
- plt.close()
510
-
511
- # Read the image back as bytes
512
- with open(tmpfile.name, "rb") as f:
513
- image_bytes = f.read()
514
-
515
- # Encode image as base64
516
- image_base64 = base64.b64encode(image_bytes).decode('utf-8')
517
-
518
- return {
519
- "status": "success",
520
- "image": f"data:image/png;base64,{image_base64}",
521
- "code": visualization_code,
522
- "interpreted_parameters": vis_request.dict(),
523
- "data_preview": df.head().to_dict(orient='records')
524
- }
525
-
526
- except HTTPException:
527
- raise
528
- except Exception as e:
529
- logger.error(f"Natural language visualization failed: {str(e)}\n{traceback.format_exc()}")
530
- raise HTTPException(500, detail=f"Visualization failed: {str(e)}")
531
-
532
- @app.post("/get_columns")
533
  @limiter.limit("10/minute")
534
- async def get_excel_columns(
535
  request: Request,
536
- file: UploadFile = File(...)
 
 
537
  ):
538
- try:
539
- file_ext, content = await process_uploaded_file(file)
540
- if file_ext not in {"xlsx", "xls"}:
541
- raise HTTPException(400, "Only Excel files are supported")
542
-
543
- df = pd.read_excel(io.BytesIO(content))
544
- return {
545
- "columns": list(df.columns),
546
- "sample_data": df.head().to_dict(orient='records'),
547
- "statistics": df.describe().to_dict() if len(df.select_dtypes(include=['number']).columns) > 0 else None
548
- }
549
- except Exception as e:
550
- logger.error(f"Column extraction failed: {str(e)}")
551
- raise HTTPException(500, detail="Failed to extract columns from Excel file")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
552
 
553
- @app.exception_handler(RateLimitExceeded)
554
- async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
555
- return JSONResponse(
556
- status_code=429,
557
- content={"detail": "Too many requests. Please try again later."}
558
- )
559
 
560
- if __name__ == "__main__":
561
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
 
2
  from fastapi.responses import JSONResponse
 
 
 
 
 
 
 
 
 
 
 
 
3
  from slowapi import Limiter
4
  from slowapi.util import get_remote_address
5
  from slowapi.errors import RateLimitExceeded
6
+ from fastapi.middleware.cors import CORSMiddleware
7
+ from starlette.requests import Request
 
 
 
 
 
 
 
8
 
9
+ import pytesseract
10
+ from PIL import Image
11
+ import fitz # PyMuPDF
12
+ import docx
13
+ import pptx
14
+ import pandas as pd
15
+ import io
16
 
17
+ from transformers import pipeline
18
+ import matplotlib.pyplot as plt
19
+ import seaborn as sns
20
+ import uuid
21
+ import os
22
 
23
  app = FastAPI()
24
 
25
+ # CORS (optional, for frontend access)
 
 
 
 
26
  app.add_middleware(
27
  CORSMiddleware,
28
  allow_origins=["*"],
 
30
  allow_headers=["*"],
31
  )
32
 
33
+ # Rate Limiting
34
+ limiter = Limiter(key_func=get_remote_address)
35
+ app.state.limiter = limiter
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
+ @app.exception_handler(RateLimitExceeded)
38
+ async def rate_limit_exceeded_handler(request: Request, exc: RateLimitExceeded):
39
+ return JSONResponse(
40
+ status_code=429,
41
+ content={"error": "Rate limit exceeded. Please try again later."}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  )
43
 
44
+ # Hugging Face Pipelines
45
+ summarizer = pipeline("summarization", model="facebook/bart-large-cnn")
46
+ qa_pipeline = pipeline("question-answering", model="deepset/roberta-base-squad2")
47
+ image_captioner = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
48
+
49
+ # Utility: Save image and return path
50
+ def save_temp_image(upload: UploadFile):
51
+ image_path = f"temp/{uuid.uuid4().hex}_{upload.filename}"
52
+ with open(image_path, "wb") as f:
53
+ f.write(upload.file.read())
54
+ return image_path
55
+
56
+ # --- File Parsing Utilities ---
57
+ def extract_text_from_pdf(file_bytes: bytes) -> str:
58
+ doc = fitz.open(stream=file_bytes, filetype="pdf")
59
+ return "\n".join(page.get_text() for page in doc)
60
+
61
+ def extract_text_from_docx(file_bytes: bytes) -> str:
62
+ doc = docx.Document(io.BytesIO(file_bytes))
63
+ return "\n".join(p.text for p in doc.paragraphs)
64
+
65
+ def extract_text_from_pptx(file_bytes: bytes) -> str:
66
+ prs = pptx.Presentation(io.BytesIO(file_bytes))
67
+ text = ""
68
+ for slide in prs.slides:
69
+ for shape in slide.shapes:
70
+ if hasattr(shape, "text"):
71
+ text += shape.text + "\n"
72
+ return text
73
+
74
+ def extract_text_from_image(file_bytes: bytes) -> str:
75
+ img = Image.open(io.BytesIO(file_bytes))
76
+ return pytesseract.image_to_string(img)
77
+
78
+ def extract_data_from_excel(file_bytes: bytes) -> pd.DataFrame:
79
+ return pd.read_excel(io.BytesIO(file_bytes))
80
+
81
+ # --- API Endpoints ---
82
+ @app.post("/process/")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  @limiter.limit("10/minute")
84
+ async def process_file(
85
  request: Request,
86
+ file: UploadFile = File(...),
87
+ task: str = Form(...),
88
+ question: str = Form(None)
89
  ):
90
+ content_type = file.content_type
91
+ file_bytes = await file.read()
92
+
93
+ # --- Task: Summarization or QA ---
94
+ if task in ["summarization", "question_answering"]:
95
+ if content_type == "application/pdf":
96
+ text = extract_text_from_pdf(file_bytes)
97
+ elif content_type in ["application/vnd.openxmlformats-officedocument.wordprocessingml.document"]:
98
+ text = extract_text_from_docx(file_bytes)
99
+ elif content_type in ["application/vnd.openxmlformats-officedocument.presentationml.presentation"]:
100
+ text = extract_text_from_pptx(file_bytes)
101
+ elif content_type in ["image/png", "image/jpeg"]:
102
+ text = extract_text_from_image(file_bytes)
103
+ else:
104
+ raise HTTPException(status_code=400, detail="Unsupported file format for this task.")
105
+
106
+ if task == "summarization":
107
+ summary = summarizer(text[:3000])[0]["summary_text"] # truncate long text
108
+ return {"summary": summary}
109
+
110
+ if task == "question_answering":
111
+ if not question:
112
+ raise HTTPException(status_code=400, detail="Question is required for QA.")
113
+ answer = qa_pipeline(question=question, context=text)
114
+ return {"answer": answer["answer"]}
115
+
116
+ # --- Task: Image Captioning ---
117
+ elif task == "captioning":
118
+ if content_type not in ["image/png", "image/jpeg"]:
119
+ raise HTTPException(status_code=400, detail="Only image files supported for captioning.")
120
+ image_path = save_temp_image(file)
121
+ caption = image_captioner(image_path)[0]["generated_text"]
122
+ os.remove(image_path)
123
+ return {"caption": caption}
124
+
125
+ # --- Task: Data Visualization ---
126
+ elif task == "visualization":
127
+ if content_type != "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
128
+ raise HTTPException(status_code=400, detail="Only Excel files supported for visualization.")
129
+ df = extract_data_from_excel(file_bytes)
130
+
131
+ if df.empty:
132
+ raise HTTPException(status_code=400, detail="No data found in Excel file.")
133
+
134
+ # Example visualization: correlation heatmap
135
+ numeric_df = df.select_dtypes(include="number")
136
+ if numeric_df.empty:
137
+ raise HTTPException(status_code=400, detail="No numeric data available for visualization.")
138
+
139
+ plt.figure(figsize=(10, 6))
140
+ sns.heatmap(numeric_df.corr(), annot=True, cmap="coolwarm")
141
+ viz_path = f"temp/viz_{uuid.uuid4().hex}.png"
142
+ plt.savefig(viz_path)
143
+ plt.close()
144
+
145
+ with open(viz_path, "rb") as img_file:
146
+ img_bytes = img_file.read()
147
+ os.remove(viz_path)
148
+
149
+ return JSONResponse(content={"image_bytes": list(img_bytes)})
150
 
151
+ else:
152
+ raise HTTPException(status_code=400, detail="Unsupported task.")
 
 
 
 
153