diginoron committed on
Commit
cfeeb7f
·
verified ·
1 Parent(s): 36aba49

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +57 -11
app.py CHANGED
@@ -1,8 +1,15 @@
1
  import gradio as gr
2
  from gtts import gTTS
3
  import pdfplumber
 
 
 
 
4
  import os
5
 
 
 
 
6
  def extract_text_from_pdf(pdf_file):
7
  """
8
  Extract text from a PDF file using pdfplumber.
@@ -22,45 +29,84 @@ def extract_text_from_pdf(pdf_file):
22
  except Exception as e:
23
  return f"Error extracting text: {str(e)}"
24
 
25
def text_to_speech(pdf_file, lang="en"):
    """
    Convert text from a PDF to speech using gTTS and return the audio file path.

    Args:
        pdf_file: Uploaded PDF file.
        lang (str): Language code (default is 'en' for English).

    Returns:
        str: Path to the generated audio file or error message.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    # Nothing was uploaded: fail fast with a clear message instead of letting
    # the PDF reader choke on None.
    if pdf_file is None:
        return "Error: no PDF file was provided."

    try:
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)

        # Match only the helper's own error prefix. A plain substring test
        # ('"Error" in text') misfires on any document that merely contains
        # the word "Error".
        if text.startswith("Error extracting text:"):
            return text

        # Image-only or blank PDFs yield no text; report that explicitly
        # rather than surfacing an opaque failure from the TTS call.
        if not text.strip():
            return "Error: no readable text was found in the PDF."

        # Create gTTS object
        tts = gTTS(text=text, lang=lang, slow=False)

        # Save to a unique temporary file so simultaneous requests do not
        # overwrite one another's audio.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            output_file = tmp.name
        tts.save(output_file)

        return output_file

    except Exception as e:
        # NOTE(review): the message is returned in place of an audio path, as
        # in the original contract; confirm the UI handles that gracefully.
        return f"An error occurred: {str(e)}"
 
53
 
54
  # Define Gradio interface
55
# One PDF upload and a language picker feed text_to_speech; its result is
# rendered by a single audio player.
demo = gr.Interface(
    title="PDF to Speech with gTTS",
    description="Upload a PDF file, select a language, and generate speech from the extracted text.",
    fn=text_to_speech,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload a PDF file"),
        gr.Dropdown(label="Select Language", value="en", choices=["en", "es", "fr"]),
    ],
    outputs=gr.Audio(label="Generated Speech"),
)
65
 
66
  # Launch the app
 
1
  import gradio as gr
2
  from gtts import gTTS
3
  import pdfplumber
4
+ from sumy.parsers.plaintext import PlaintextParser
5
+ from sumy.nlp.tokenizers import Tokenizer
6
+ from sumy.summarizers.lsa import LsaSummarizer
7
+ import nltk
8
  import os
9
 
10
# Download the NLTK sentence-tokenizer data used by sumy's Tokenizer.
# "punkt" is the classic resource name. NOTE(review): newer NLTK releases
# appear to load the tokenizer from "punkt_tab" instead -- fetching both keeps
# startup working across versions; confirm against the pinned NLTK version.
nltk.download('punkt')
nltk.download('punkt_tab')
12
+
13
  def extract_text_from_pdf(pdf_file):
14
  """
15
  Extract text from a PDF file using pdfplumber.
 
29
  except Exception as e:
30
  return f"Error extracting text: {str(e)}"
31
 
32
def summarize_text(text, sentences_count=12, language="english"):
    """
    Summarize text to approximately four paragraphs using sumy LSA summarizer.

    Args:
        text (str): Text to summarize.
        sentences_count (int): Number of sentences in summary (approx. 3 sentences per paragraph).
        language (str): Language name handed to sumy's Tokenizer
            (default is 'english', matching the previous hard-coded value).

    Returns:
        str: Summary with a blank line after every third sentence,
            "No summary generated." when there is nothing to summarize, or a
            message starting with "Error summarizing text:" on failure.
    """
    sentences_per_paragraph = 3

    try:
        # Empty or whitespace-only input cannot produce a summary; skip the
        # tokenizer/summarizer work entirely.
        if not text or not text.strip():
            return "No summary generated."

        # Initialize parser and tokenizer
        parser = PlaintextParser.from_string(text, Tokenizer(language))
        summarizer = LsaSummarizer()

        # Summarize to specified number of sentences
        sentences = [
            str(sentence)
            for sentence in summarizer(parser.document, sentences_count)
        ]

        # Group sentences into paragraphs and join them, so no stray space is
        # left dangling before each paragraph break.
        paragraphs = [
            " ".join(sentences[start:start + sentences_per_paragraph])
            for start in range(0, len(sentences), sentences_per_paragraph)
        ]
        summary_text = "\n\n".join(paragraphs).strip()

        return summary_text if summary_text else "No summary generated."
    except Exception as e:
        return f"Error summarizing text: {str(e)}"
61
+
62
def pdf_to_speech(pdf_file, lang="en"):
    """
    Convert text from a PDF to summarized speech using gTTS.

    Args:
        pdf_file: Uploaded PDF file.
        lang (str): Language code (default is 'en' for English).

    Returns:
        tuple: (path to audio file, summarized text) on success, or
            (None, error message) on failure, so the audio output is never
            handed a message where a file path is expected.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    # Nothing was uploaded: fail fast with a clear message instead of letting
    # the PDF reader choke on None.
    if pdf_file is None:
        return None, "Error: no PDF file was provided."

    try:
        # Extract text from PDF
        text = extract_text_from_pdf(pdf_file)

        # Match only the helper's own error prefix. A plain substring test
        # ('"Error" in text') misfires on any document that merely contains
        # the word "Error".
        if text.startswith("Error extracting text:"):
            return None, text

        # Summarize text (approx. 12 sentences for 4 paragraphs)
        summarized_text = summarize_text(text, sentences_count=12)
        if summarized_text.startswith("Error summarizing text:"):
            return None, summarized_text

        # Create gTTS object
        tts = gTTS(text=summarized_text, lang=lang, slow=False)

        # Save to a unique temporary file so simultaneous requests do not
        # overwrite one another's audio.
        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp:
            output_file = tmp.name
        tts.save(output_file)

        return output_file, summarized_text

    except Exception as e:
        return None, f"An error occurred: {str(e)}"
96
 
97
  # Define Gradio interface
98
# One PDF upload and a language picker feed pdf_to_speech; its two results go
# to an audio player and a text box, in that order.
demo = gr.Interface(
    title="PDF Summary to Speech",
    description="Upload an English PDF file, select a language, and generate speech from a summarized version (approx. 4 paragraphs). The summarized text is also displayed.",
    fn=pdf_to_speech,
    inputs=[
        gr.File(file_types=[".pdf"], label="Upload a PDF file"),
        gr.Dropdown(label="Select Language", value="en", choices=["en", "es", "fr"]),
    ],
    outputs=[
        gr.Audio(label="Generated Speech"),
        gr.Textbox(label="Summarized Text"),
    ],
)
111
 
112
  # Launch the app