Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

chenguittiMaroua committed on Apr 26

Commit

94bcc5a

verified ·

1 Parent(s): dba5d7e

Update main.py

Browse files

Files changed (1) hide show

main.py +85 -70

main.py CHANGED Viewed

@@ -127,30 +127,24 @@ def get_summarizer():
   #  if qa_model is None:
    #     qa_model= pipe = pipeline("question-answering", model="deepset/roberta-base-squad2")
     #return qa_model
-from transformers import RagTokenizer, RagTokenForGeneration, pipeline
-qa_model = None
-rag_model = None
 def get_qa_model():
     global qa_model
     if qa_model is None:
-        qa_model = pipeline(
-            "question-answering",
-            model="deepset/roberta-base-squad2"
-        )
     return qa_model
-def get_rag_model():
-    global rag_model
-    if rag_model is None:
-        rag_model = pipeline(
-            "text-generation",
-            model="facebook/rag-token-nq",
-            tokenizer="facebook/rag-token-nq"
-        )
-    return rag_model
@@ -161,7 +155,7 @@ def get_image_captioner():
     return image_captioner
 async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
-    """Validate and process uploaded file with special handling for each type"""
     if not file.filename:
         raise HTTPException(400, "No filename provided")
@@ -173,7 +167,6 @@ async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
     if len(content) > MAX_FILE_SIZE:
         raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
-    # Special validation for PDFs
     if file_ext == "pdf":
         try:
             with fitz.open(stream=content, filetype="pdf") as doc:
@@ -186,14 +179,13 @@ async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
             logger.error(f"PDF validation failed: {str(e)}")
             raise HTTPException(422, detail=f"Invalid PDF file: {str(e)}")
-    await file.seek(0)  # Reset file pointer for processing
     return file_ext, content
 def extract_text(content: bytes, file_ext: str) -> str:
-    """Extract text from various file formats with enhanced Excel support"""
     try:
         if file_ext == "txt":
-            # Decode plain text (handle encoding issues)
             return content.decode("utf-8", errors="replace").strip()
         if file_ext == "docx":
@@ -201,7 +193,6 @@ def extract_text(content: bytes, file_ext: str) -> str:
             return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
         elif file_ext in {"xlsx", "xls"}:
-            # Improved Excel handling with better NaN and date support
             df = pd.read_excel(
                 io.BytesIO(content),
                 sheet_name=None,
@@ -214,12 +205,9 @@ def extract_text(content: bytes, file_ext: str) -> str:
             all_text = []
             for sheet_name, sheet_data in df.items():
                 sheet_text = []
-                # Convert all data to string and handle special types
                 for column in sheet_data.columns:
-                    # Handle datetime columns
                     if pd.api.types.is_datetime64_any_dtype(sheet_data[column]):
                         sheet_data[column] = sheet_data[column].dt.strftime('%Y-%m-%d %H:%M:%S')
-                    # Convert to string and clean
                     col_text = sheet_data[column].astype(str).replace(['nan', 'None', 'NaT'], '').tolist()
                     sheet_text.extend([x for x in col_text if x.strip()])
@@ -241,14 +229,12 @@ def extract_text(content: bytes, file_ext: str) -> str:
             return "\n".join(page.get_text("text") for page in pdf)
         elif file_ext in {"jpg", "jpeg", "png"}:
-            # First try OCR
             try:
                 image = Image.open(io.BytesIO(content))
                 text = pytesseract.image_to_string(image, config='--psm 6')
                 if text.strip():
                     return text
-                # If OCR fails, try image captioning
                 captioner = get_image_captioner()
                 result = captioner(image)
                 return result[0]['generated_text']
@@ -260,6 +246,19 @@ def extract_text(content: bytes, file_ext: str) -> str:
         logger.error(f"Text extraction failed for {file_ext}: {str(e)}", exc_info=True)
         raise HTTPException(422, f"Failed to extract text from {file_ext} file: {str(e)}")
 # Visualization Models
 class VisualizationRequest(BaseModel):
     chart_type: str
@@ -833,61 +832,77 @@ async def summarize_document(request: Request, file: UploadFile = File(...)):
 from typing import Optional
 @app.post("/qa")
-@limiter.limit("5/minute")
 async def question_answering(
     request: Request,
-    file: Optional[UploadFile] = File(None),  # Make file optional
     question: str = Form(...),
-    language: str = Form("fr")
 ):
-    # Validate question
-    if not question.strip():
-        raise HTTPException(400, "Question cannot be empty")
-    # Check if the question is about the document
-    is_doc_question = any(
-        kw in question.lower()
-        for kw in ["document", "file", "text", "this pdf", "this doc"]
-    )
-    # (A) If file is provided and question is about it → Document QA
-    if file and is_doc_question:
-        try:
-            file_ext, content = await process_uploaded_file(file)
-            text = extract_text(content, file_ext)
-            text = re.sub(r'\s+', ' ', text).strip()[:5000]
-            qa = get_qa_model()
-            result = qa(question=question, context=text[:3000])
-            return {
-                "question": question,
-                "answer": result["answer"],
-                "confidence": result["score"],
-                "source": "document",
-                "language": language
-            }
-        except Exception as e:
-            logger.error(f"Doc QA failed: {str(e)}")
-            raise HTTPException(500, "Failed to analyze document")
-    # (B) If no file or general question → Open-domain QA (RAG)
-    else:
         try:
-            rag = get_rag_model()
-            answer = rag(question)[0]["generated_text"]
-            return {
                 "question": question,
-                "answer": answer,
-                "confidence": 0.8,  # RAG doesn't return scores
                 "source": "general knowledge",
                 "language": language
-            }
         except Exception as e:
-            logger.error(f"RAG failed: {str(e)}")
-            raise HTTPException(500, "Failed to fetch general answer")

   #  if qa_model is None:
    #     qa_model= pipe = pipeline("question-answering", model="deepset/roberta-base-squad2")
     #return qa_model
 def get_qa_model():
     global qa_model
     if qa_model is None:
+        try:
+            qa_model = pipeline(
+                "text2text-generation",
+                model="google/flan-t5-base",
+                device=0 if torch.cuda.is_available() else -1
+            )
+        except Exception as e:
+            logger.error(f"Failed to load QA model: {str(e)}")
+            raise HTTPException(500, "Failed to initialize QA system")
     return qa_model
     return image_captioner
 async def process_uploaded_file(file: UploadFile) -> Tuple[str, bytes]:
+    """Your existing file processing function"""
     if not file.filename:
         raise HTTPException(400, "No filename provided")
     if len(content) > MAX_FILE_SIZE:
         raise HTTPException(413, f"File too large. Max size: {MAX_FILE_SIZE//1024//1024}MB")
     if file_ext == "pdf":
         try:
             with fitz.open(stream=content, filetype="pdf") as doc:
             logger.error(f"PDF validation failed: {str(e)}")
             raise HTTPException(422, detail=f"Invalid PDF file: {str(e)}")
+    await file.seek(0)
     return file_ext, content
 def extract_text(content: bytes, file_ext: str) -> str:
+    """Your existing text extraction function"""
     try:
         if file_ext == "txt":
             return content.decode("utf-8", errors="replace").strip()
         if file_ext == "docx":
             return "\n".join(para.text for para in doc.paragraphs if para.text.strip())
         elif file_ext in {"xlsx", "xls"}:
             df = pd.read_excel(
                 io.BytesIO(content),
                 sheet_name=None,
             all_text = []
             for sheet_name, sheet_data in df.items():
                 sheet_text = []
                 for column in sheet_data.columns:
                     if pd.api.types.is_datetime64_any_dtype(sheet_data[column]):
                         sheet_data[column] = sheet_data[column].dt.strftime('%Y-%m-%d %H:%M:%S')
                     col_text = sheet_data[column].astype(str).replace(['nan', 'None', 'NaT'], '').tolist()
                     sheet_text.extend([x for x in col_text if x.strip()])
             return "\n".join(page.get_text("text") for page in pdf)
         elif file_ext in {"jpg", "jpeg", "png"}:
             try:
                 image = Image.open(io.BytesIO(content))
                 text = pytesseract.image_to_string(image, config='--psm 6')
                 if text.strip():
                     return text
                 captioner = get_image_captioner()
                 result = captioner(image)
                 return result[0]['generated_text']
         logger.error(f"Text extraction failed for {file_ext}: {str(e)}", exc_info=True)
         raise HTTPException(422, f"Failed to extract text from {file_ext} file: {str(e)}")
 # Visualization Models
 class VisualizationRequest(BaseModel):
     chart_type: str
 from typing import Optional
 @app.post("/qa")
 async def question_answering(
     request: Request,
     question: str = Form(...),
+    file: Optional[UploadFile] = File(None),
+    language: str = Form("en")
 ):
+    """
+    Enhanced QA endpoint that:
+    - Processes uploaded files using your existing functions
+    - Answers questions using FLAN-T5
+    - Handles both document and general knowledge questions
+    """
+    try:
+        # Validate question
+        if not question.strip():
+            raise HTTPException(400, "Question cannot be empty")
+        qa_pipeline = get_qa_model()
+        # Case 1: Document QA (when file is provided)
+        if file:
+            try:
+                file_ext, content = await process_uploaded_file(file)
+                text = extract_text(content, file_ext)
+                # Clean and truncate text
+                clean_text = re.sub(r'\s+', ' ', text).strip()[:5000]
+                # Format for FLAN-T5 (combine question and context)
+                input_text = f"Answer this question based on the given context. Question: {question} Context: {clean_text}"
+                result = qa_pipeline(input_text, max_length=200)
+                return JSONResponse({
+                    "question": question,
+                    "answer": result[0]["generated_text"],
+                    "source": "document",
+                    "language": language,
+                    "file_type": file_ext
+                })
+            except HTTPException:
+                raise
+            except Exception as e:
+                logger.error(f"Document QA failed: {str(e)}")
+                raise HTTPException(500, "Failed to analyze document")
+        # Case 2: General QA (no file provided)
         try:
+            input_text = f"Answer this question: {question}"
+            result = qa_pipeline(input_text, max_length=200)
+            return JSONResponse({
                 "question": question,
+                "answer": result[0]["generated_text"],
                 "source": "general knowledge",
                 "language": language
+            })
         except Exception as e:
+            logger.error(f"General QA failed: {str(e)}")
+            raise HTTPException(500, "Failed to generate answer")
+    except HTTPException:
+        raise
+    except Exception as e:
+        logger.critical(f"Unexpected error: {str(e)}")
+        raise HTTPException(500, "Internal server error")