diginoron committed on
Commit
8c10b14
·
verified ·
1 Parent(s): c17b07a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +20 -19
app.py CHANGED
@@ -8,7 +8,11 @@ import nltk
8
  import os
9
 
10
  # Download NLTK data for sumy
11
- nltk.download('punkt')
 
 
 
 
12
 
13
  def extract_text_from_pdf(pdf_file):
14
  """
@@ -18,14 +22,16 @@ def extract_text_from_pdf(pdf_file):
18
  pdf_file: Uploaded PDF file.
19
 
20
  Returns:
21
- str: Extracted text from the PDF.
22
  """
23
  try:
24
  with pdfplumber.open(pdf_file) as pdf:
25
  text = ""
26
  for page in pdf.pages:
27
- text += page.extract_text() or ""
28
- return text if text else "No text could be extracted from the PDF."
 
 
29
  except Exception as e:
30
  return f"Error extracting text: {str(e)}"
31
 
@@ -38,23 +44,19 @@ def summarize_text(text, sentences_count=12):
38
  sentences_count (int): Number of sentences in summary (approx. 3 sentences per paragraph).
39
 
40
  Returns:
41
- str: Summarized text.
42
  """
43
  try:
44
- # Initialize parser and tokenizer
 
45
  parser = PlaintextParser.from_string(text, Tokenizer("english"))
46
  summarizer = LsaSummarizer()
47
-
48
- # Summarize to specified number of sentences
49
  summary = summarizer(parser.document, sentences_count)
50
-
51
- # Combine sentences and format into paragraphs (approx. 3 sentences per paragraph)
52
  summary_text = ""
53
  for i, sentence in enumerate(summary):
54
  summary_text += str(sentence) + " "
55
- if (i + 1) % 3 == 0: # Add paragraph break every 3 sentences
56
  summary_text += "\n\n"
57
-
58
  return summary_text.strip() if summary_text else "No summary generated."
59
  except Exception as e:
60
  return f"Error summarizing text: {str(e)}"
@@ -68,18 +70,18 @@ def pdf_to_speech(pdf_file, lang="en"):
68
  lang (str): Language code (default is 'en' for English).
69
 
70
  Returns:
71
- tuple: (Path to audio file, summarized text) or (error message, error message).
72
  """
73
  try:
74
  # Extract text from PDF
75
  text = extract_text_from_pdf(pdf_file)
76
  if "Error" in text:
77
- return text, text
78
 
79
- # Summarize text (approx. 12 sentences for 4 paragraphs)
80
  summarized_text = summarize_text(text, sentences_count=12)
81
- if "Error" in summarized_text:
82
- return summarized_text, summarized_text
83
 
84
  # Create gTTS object
85
  tts = gTTS(text=summarized_text, lang=lang, slow=False)
@@ -91,8 +93,7 @@ def pdf_to_speech(pdf_file, lang="en"):
91
  return output_file, summarized_text
92
 
93
  except Exception as e:
94
- error_msg = f"An error occurred: {str(e)}"
95
- return error_msg, error_msg
96
 
97
  # Define Gradio interface
98
  demo = gr.Interface(
 
8
  import os
9
 
10
  # Download NLTK data for sumy
11
+ try:
12
+ nltk.download('punkt')
13
+ nltk.download('punkt_tab')
14
+ except Exception as e:
15
+ print(f"Error downloading NLTK data: {str(e)}")
16
 
17
  def extract_text_from_pdf(pdf_file):
18
  """
 
22
  pdf_file: Uploaded PDF file.
23
 
24
  Returns:
25
+ str: Extracted text or error message.
26
  """
27
  try:
28
  with pdfplumber.open(pdf_file) as pdf:
29
  text = ""
30
  for page in pdf.pages:
31
+ page_text = page.extract_text()
32
+ if page_text:
33
+ text += page_text + " "
34
+ return text.strip() if text else "No text could be extracted from the PDF."
35
  except Exception as e:
36
  return f"Error extracting text: {str(e)}"
37
 
 
44
  sentences_count (int): Number of sentences in summary (approx. 3 sentences per paragraph).
45
 
46
  Returns:
47
+ str: Summarized text or error message.
48
  """
49
  try:
50
+ if len(text.split()) < 50:
51
+ return "Text is too short to summarize."
52
  parser = PlaintextParser.from_string(text, Tokenizer("english"))
53
  summarizer = LsaSummarizer()
 
 
54
  summary = summarizer(parser.document, sentences_count)
 
 
55
  summary_text = ""
56
  for i, sentence in enumerate(summary):
57
  summary_text += str(sentence) + " "
58
+ if (i + 1) % 3 == 0:
59
  summary_text += "\n\n"
 
60
  return summary_text.strip() if summary_text else "No summary generated."
61
  except Exception as e:
62
  return f"Error summarizing text: {str(e)}"
 
70
  lang (str): Language code (default is 'en' for English).
71
 
72
  Returns:
73
+ tuple: (Path to audio file or None, summarized text or error message).
74
  """
75
  try:
76
  # Extract text from PDF
77
  text = extract_text_from_pdf(pdf_file)
78
  if "Error" in text:
79
+ return None, text
80
 
81
+ # Summarize text
82
  summarized_text = summarize_text(text, sentences_count=12)
83
+ if "Error" in summarized_text or "too short" in summarized_text:
84
+ return None, summarized_text
85
 
86
  # Create gTTS object
87
  tts = gTTS(text=summarized_text, lang=lang, slow=False)
 
93
  return output_file, summarized_text
94
 
95
  except Exception as e:
96
+ return None, f"An error occurred: {str(e)}"
 
97
 
98
  # Define Gradio interface
99
  demo = gr.Interface(