Spaces:
Sleeping
Sleeping
Update main.py
Browse files
main.py
CHANGED
@@ -728,30 +728,78 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
|
|
728 |
@app.post("/summarize")
|
729 |
@limiter.limit("5/minute")
|
730 |
async def summarize_document(request: Request, file: UploadFile = File(...)):
|
|
|
|
|
|
|
|
|
731 |
try:
|
|
|
732 |
file_ext, content = await process_uploaded_file(file)
|
|
|
|
|
733 |
text = extract_text(content, file_ext)
|
734 |
|
735 |
if not text.strip():
|
736 |
raise HTTPException(400, "No extractable text found")
|
737 |
|
738 |
-
# Clean
|
739 |
text = re.sub(r'\s+', ' ', text).strip()
|
740 |
-
chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
|
741 |
|
742 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
743 |
summarizer = get_summarizer()
|
|
|
|
|
744 |
summaries = []
|
745 |
for chunk in chunks:
|
746 |
-
|
747 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
748 |
|
749 |
-
return {"summary":
|
750 |
|
751 |
except HTTPException:
|
752 |
raise
|
753 |
except Exception as e:
|
754 |
-
logger.error(f"Summarization failed: {str(e)}")
|
755 |
raise HTTPException(500, "Document summarization failed")
|
756 |
@app.post("/qa")
|
757 |
@limiter.limit("5/minute")
|
|
|
728 |
def _split_into_chunks(text, max_chars=1000):
    """Group the sentences of *text* into chunks of at most *max_chars* characters.

    Sentences are kept whole where possible.  A single sentence longer than
    *max_chars* (e.g. text with no terminal punctuation at all) is sliced into
    fixed-width pieces so that none of it is silently dropped later by the
    summarizer's input truncation.  Empty chunks are never produced.
    """
    chunks = []
    pending = []      # sentences collected for the chunk being built
    pending_len = 0   # length of " ".join(pending)

    for sentence in re.split(r'(?<=[.!?]) +', text):
        if not sentence:
            continue

        if len(sentence) > max_chars:
            # Flush what we have, then hard-split the oversized sentence.
            if pending:
                chunks.append(" ".join(pending))
                pending, pending_len = [], 0
            chunks.extend(
                sentence[start:start + max_chars]
                for start in range(0, len(sentence), max_chars)
            )
            continue

        # Account for the joining space when the chunk is not empty.
        needed = len(sentence) + (1 if pending else 0)
        if pending and pending_len + needed > max_chars:
            chunks.append(" ".join(pending))
            pending, pending_len = [], 0
            needed = len(sentence)

        pending.append(sentence)
        pending_len += needed

    if pending:
        chunks.append(" ".join(pending))
    return chunks


def _leading_sentences(chunk, count=3):
    """Return the first *count* sentences of *chunk*, punctuation preserved."""
    return " ".join(re.split(r'(?<=[.!?]) +', chunk)[:count])


@app.post("/summarize")
@limiter.limit("5/minute")
async def summarize_document(request: Request, file: UploadFile = File(...)):
    """
    Summarize content from various file types (PDF, Word, Excel, PowerPoint, Images)
    Returns a concise summary of the document's main points.

    The extracted text is whitespace-normalized, split into sentence-aligned
    chunks of at most 1000 characters, and each chunk is summarized
    independently.  The partial summaries are joined; if the result exceeds
    300 words it is summarized once more.

    Args:
        request: Incoming request object (not read here; presumably required
            by the rate-limiter decorator -- confirm).
        file: The uploaded document.

    Returns:
        A dict of the form ``{"summary": <str>}``.

    Raises:
        HTTPException: 400 when no text can be extracted; 500 on any other
            unexpected failure.  HTTPExceptions raised by the helpers are
            propagated unchanged.
    """
    try:
        file_ext, content = await process_uploaded_file(file)
        text = extract_text(content, file_ext)

        if not text.strip():
            raise HTTPException(400, "No extractable text found")

        # Collapse all whitespace runs so sentence splitting sees single spaces.
        text = re.sub(r'\s+', ' ', text).strip()
        chunks = _split_into_chunks(text)

        # NOTE(review): the summarizer call below is synchronous; inside an
        # async handler it blocks the event loop while it runs.  Consider
        # off-loading it to a worker thread.
        summarizer = get_summarizer()

        summaries = []
        for chunk in chunks:
            try:
                summary = summarizer(
                    chunk,
                    max_length=150,
                    min_length=50,
                    do_sample=False,
                    truncation=True,
                )[0]["summary_text"]
            except Exception as chunk_error:
                # Best effort: one bad chunk must not fail the whole document.
                logger.warning("Failed to summarize chunk: %s", chunk_error)
                summary = _leading_sentences(chunk)
            summaries.append(summary)

        combined_summary = re.sub(r'\s+', ' ', " ".join(summaries)).strip()

        # If the joined summary is still long, condense it once more.
        if len(combined_summary.split()) > 300:
            try:
                combined_summary = summarizer(
                    combined_summary,
                    max_length=200,
                    min_length=100,
                    do_sample=False,
                    truncation=True,
                )[0]["summary_text"]
            except Exception as final_error:
                # Keep the (longer) joined summary rather than discarding
                # the per-chunk work with a 500.
                logger.warning(
                    "Failed to condense combined summary: %s", final_error
                )

        return {"summary": combined_summary}

    except HTTPException:
        raise
    except Exception as e:
        logger.error("Summarization failed: %s", e, exc_info=True)
        raise HTTPException(500, "Document summarization failed")
|
804 |
@app.post("/qa")
|
805 |
@limiter.limit("5/minute")
|