chenguittiMaroua committed on
Commit
0c9d79d
·
verified ·
1 Parent(s): ea34500

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +55 -7
main.py CHANGED
@@ -728,30 +728,78 @@ def generate_dynamic_visualization_code(df: pd.DataFrame, request: Visualization
728
  @app.post("/summarize")
729
  @limiter.limit("5/minute")
730
  async def summarize_document(request: Request, file: UploadFile = File(...)):
 
 
 
 
731
  try:
 
732
  file_ext, content = await process_uploaded_file(file)
 
 
733
  text = extract_text(content, file_ext)
734
 
735
  if not text.strip():
736
  raise HTTPException(400, "No extractable text found")
737
 
738
- # Clean and chunk text
739
  text = re.sub(r'\s+', ' ', text).strip()
740
- chunks = [text[i:i+1000] for i in range(0, len(text), 1000)]
741
 
742
- # Summarize each chunk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
743
  summarizer = get_summarizer()
 
 
744
  summaries = []
745
  for chunk in chunks:
746
- summary = summarizer(chunk, max_length=150, min_length=50, do_sample=False)[0]["summary_text"]
747
- summaries.append(summary)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
748
 
749
- return {"summary": " ".join(summaries)}
750
 
751
  except HTTPException:
752
  raise
753
  except Exception as e:
754
- logger.error(f"Summarization failed: {str(e)}")
755
  raise HTTPException(500, "Document summarization failed")
756
  @app.post("/qa")
757
  @limiter.limit("5/minute")
 
728
  @app.post("/summarize")
729
  @limiter.limit("5/minute")
730
  async def summarize_document(request: Request, file: UploadFile = File(...)):
731
+ """
732
+ Summarize content from various file types (PDF, Word, Excel, PowerPoint, Images)
733
+ Returns a concise summary of the document's main points.
734
+ """
735
  try:
736
+ # Use your existing file processing and validation
737
  file_ext, content = await process_uploaded_file(file)
738
+
739
+ # Use your existing text extraction function
740
  text = extract_text(content, file_ext)
741
 
742
  if not text.strip():
743
  raise HTTPException(400, "No extractable text found")
744
 
745
+ # Clean text (preserving your existing approach)
746
  text = re.sub(r'\s+', ' ', text).strip()
 
747
 
748
+ # Improved chunking with sentence awareness
749
+ sentences = re.split(r'(?<=[.!?]) +', text)
750
+ chunks = []
751
+ current_chunk = ""
752
+
753
+ for sentence in sentences:
754
+ if len(current_chunk) + len(sentence) <= 1000:
755
+ current_chunk += " " + sentence
756
+ else:
757
+ chunks.append(current_chunk.strip())
758
+ current_chunk = sentence
759
+ if current_chunk:
760
+ chunks.append(current_chunk.strip())
761
+
762
+ # Get your cached summarizer
763
  summarizer = get_summarizer()
764
+
765
+ # Summarize each chunk with error handling
766
  summaries = []
767
  for chunk in chunks:
768
+ try:
769
+ summary = summarizer(
770
+ chunk,
771
+ max_length=150,
772
+ min_length=50,
773
+ do_sample=False,
774
+ truncation=True
775
+ )[0]["summary_text"]
776
+ summaries.append(summary)
777
+ except Exception as chunk_error:
778
+ logger.warning(f"Failed to summarize chunk: {str(chunk_error)}")
779
+ # Fallback: include the first 3 sentences of the chunk
780
+ fallback = " ".join(chunk.split('.')[:3]) + "."
781
+ summaries.append(fallback)
782
+
783
+ # Combine and clean the final summary
784
+ combined_summary = " ".join(summaries)
785
+ combined_summary = re.sub(r'\s+', ' ', combined_summary).strip()
786
+
787
+ # If summary is too long, summarize it again
788
+ if len(combined_summary.split()) > 300:
789
+ combined_summary = summarizer(
790
+ combined_summary,
791
+ max_length=200,
792
+ min_length=100,
793
+ do_sample=False,
794
+ truncation=True
795
+ )[0]["summary_text"]
796
 
797
+ return {"summary": combined_summary}
798
 
799
  except HTTPException:
800
  raise
801
  except Exception as e:
802
+ logger.error(f"Summarization failed: {str(e)}", exc_info=True)
803
  raise HTTPException(500, "Document summarization failed")
804
  @app.post("/qa")
805
  @limiter.limit("5/minute")