Spaces:

chenguittiMaroua
/

asm-app

Sleeping

App Files Files Community

chenguittiMaroua committed on Apr 24

Commit

0c9d79d

verified ·

1 Parent(s): ea34500

Update main.py

Browse files

Files changed (1) hide show

main.py +55 -7

main.py CHANGED Viewed

@@ -728,30 +728,78 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
 # Pre-change side of the diff for the /summarize endpoint: lines prefixed
 # with '-' are the ones this commit removes; space-prefixed lines are kept.
 @app.post("/summarize")
 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     try:
         file_ext, content = await process_uploaded_file(file)
         text = extract_text(content, file_ext)
         # Reject uploads whose extracted text is empty or whitespace-only.
         if not text.strip():
             raise HTTPException(400, "No extractable text found")
-        # Clean and chunk text
         # Collapse every run of whitespace into a single space.
         text = re.sub(r'\s+', ' ', text).strip()
         # Fixed-width 1000-character slices; a slice boundary can fall in the
         # middle of a word or sentence.
-        chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
-        # Summarize each chunk
         summarizer = get_summarizer()
         summaries = []
         for chunk in chunks:
-            summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
-            summaries.append(summary)
-        return {"summary": " ".join(summaries)}
     # Deliberate HTTP errors (such as the 400 above) propagate unchanged.
     except HTTPException:
         raise
     # Any other failure is logged and surfaced to the client as a generic 500.
     except Exception as e:
-        logger.error(f"Summarization failed: {str(e)}")
         raise HTTPException(500, "Document summarization failed")
 @app.post("/qa")
 @limiter.limit("5/minute")

 def _chunk_by_sentence(text, max_chars=1000):
     """Pack whole sentences of *text* into chunks of at most *max_chars* characters.

     Sentences are split after '.', '!' or '?' followed by one or more spaces.
     A single sentence longer than *max_chars* is kept intact as its own chunk
     (the caller summarizes with truncation enabled). Empty chunks are never
     produced, including when the very first sentence is over-long.
     """
     chunks = []
     current = ""
     for sentence in re.split(r'(?<=[.!?]) +', text):
         if not sentence:
             continue
         # Measure the chunk as it will actually be stored, joining space included.
         candidate = f"{current} {sentence}" if current else sentence
         if len(candidate) <= max_chars:
             current = candidate
             continue
         if current:
             chunks.append(current)
         current = sentence
     if current:
         chunks.append(current)
     return chunks


 def _leading_sentences(chunk, count=3):
     """Return the first *count* sentences of *chunk* with their punctuation intact."""
     return " ".join(re.split(r'(?<=[.!?]) +', chunk)[:count])


 @app.post("/summarize")
 @limiter.limit("5/minute")
 async def summarize_document(request: Request, file: UploadFile = File(...)):
     """Summarize the text extracted from an uploaded document.

     The upload is read by ``process_uploaded_file`` and converted to text by
     ``extract_text`` (the accepted file formats are whatever those helpers
     support). The text is whitespace-normalized, grouped into sentence-aligned
     chunks of up to 1000 characters, and each chunk is summarized separately.
     If the joined result exceeds 300 words it is summarized once more.

     Returns:
         A dict of the form ``{"summary": <str>}``.

     Raises:
         HTTPException: 400 when no text could be extracted; 500 for any other
             failure (details are logged, not returned to the client).
     """
     try:
         file_ext, content = await process_uploaded_file(file)
         text = extract_text(content, file_ext)
         if not text.strip():
             raise HTTPException(400, "No extractable text found")
         # Collapse every run of whitespace so sentence splitting sees single spaces.
         text = re.sub(r'\s+', ' ', text).strip()
         summarizer = get_summarizer()
         # NOTE(review): the summarizer is called synchronously inside an async
         # handler; consider offloading it to a worker thread once it is
         # confirmed safe to call the model from one.
         summaries = []
         for chunk in _chunk_by_sentence(text):
             try:
                 result = summarizer(
                     chunk,
                     max_length=150,
                     min_length=50,
                     do_sample=False,
                     truncation=True,
                 )
                 summaries.append(result[0]["summary_text"])
             except Exception as chunk_error:
                 # Best effort: one bad chunk should not fail the whole request.
                 logger.warning("Failed to summarize chunk: %s", chunk_error)
                 summaries.append(_leading_sentences(chunk))
         combined_summary = re.sub(r'\s+', ' ', " ".join(summaries)).strip()
         # Long documents produce many chunk summaries; condense them once more.
         if len(combined_summary.split()) > 300:
             try:
                 combined_summary = summarizer(
                     combined_summary,
                     max_length=200,
                     min_length=100,
                     do_sample=False,
                     truncation=True,
                 )[0]["summary_text"]
             except Exception as condense_error:
                 # Keep the longer, still usable summary rather than returning a 500.
                 logger.warning("Failed to condense combined summary: %s", condense_error)
         return {"summary": combined_summary}
     except HTTPException:
         raise
     except Exception as e:
         logger.exception("Summarization failed: %s", e)
         raise HTTPException(500, "Document summarization failed") from e
 @app.post("/qa")
 @limiter.limit("5/minute")