emirhanbilgic committed on
Commit
88b4f72
·
verified ·
1 Parent(s): 6441d5c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -24
app.py CHANGED
@@ -15,6 +15,10 @@ import textwrap
15
  # Device configuration
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
 
 
 
 
 
18
  # Translation function
19
  @spaces.GPU(duration=120)
20
  def translate(source_text, source_lang, target_lang, batch_size=16):
@@ -53,14 +57,10 @@ def split_text_into_sentences(text):
53
  sentences = sentence_endings.split(text)
54
  return [sentence.strip() for sentence in sentences if sentence.strip()]
55
 
56
- # Function to generate audio from text
57
  @spaces.GPU(duration=120)
58
- def generate_wav_from_text(prompt, description, output_file_prefix):
59
- # Load TTS model and tokenizer
60
- tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
61
- tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
62
-
63
- input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
64
  prompt_input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
65
 
66
  generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
@@ -90,33 +90,40 @@ def update_target_lang_options(source_lang):
90
  }
91
  return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
92
 
93
- # Main Gradio function
94
- def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description):
95
- print("Extracting text from PDF...")
96
- text = pdf_to_text(pdf_file.name)
97
-
98
- # Translate if the translation checkbox is selected
99
- if translate_checkbox:
100
- print("Translating text...")
101
- text = translate(text, source_lang, target_lang)
102
-
103
- print("Splitting text into sentences...")
104
- sentences = split_text_into_sentences(text)
105
  audio_files = []
106
  outputs = []
107
-
108
  for i, sentence in enumerate(sentences):
109
  print(f"Generating audio for sentence {i+1}...")
110
  output_file_prefix = f"sentence_{i+1}"
111
- audio_file = generate_wav_from_text(sentence, description, output_file_prefix)
112
  audio_files.append(audio_file)
113
  outputs.append((sentence, audio_file))
114
-
115
- # Display each sentence and its corresponding audio immediately
116
  print(f"Generated sentence: {sentence}")
117
  gr.Markdown(f"**Sentence**: {sentence}")
118
  gr.Audio(value=audio_file, label=sentence)
119
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
120
  print("Combining all audio files...")
121
  combined_output_file = "sentences_combined.wav"
122
  combine_wav_files(combined_output_file, *audio_files)
@@ -142,7 +149,7 @@ with gr.Blocks() as demo:
142
  output_group = gr.Group()
143
 
144
  def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
145
- outputs, combined_output_file = process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description)
146
  with output_group:
147
  for sentence, audio_file in outputs:
148
  gr.Markdown(f"**Sentence**: {sentence}")
 
15
  # Device configuration
16
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
17
 
18
+ # Initialize models and tokenizers outside the functions
19
+ tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
20
+ tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
21
+
22
  # Translation function
23
  @spaces.GPU(duration=120)
24
  def translate(source_text, source_lang, target_lang, batch_size=16):
 
57
  sentences = sentence_endings.split(text)
58
  return [sentence.strip() for sentence in sentences if sentence.strip()]
59
 
60
+ # Function to generate audio for a single sentence
61
  @spaces.GPU(duration=120)
62
+ def generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer):
63
+ input_ids = tts_tokenizer(sentence, return_tensors="pt").input_ids.to(device)
 
 
 
 
64
  prompt_input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
65
 
66
  generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
 
90
  }
91
  return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
92
 
93
+ # Function to process sentences for audio generation
94
+ def process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
 
 
 
 
 
 
 
 
 
 
95
  audio_files = []
96
  outputs = []
97
+
98
  for i, sentence in enumerate(sentences):
99
  print(f"Generating audio for sentence {i+1}...")
100
  output_file_prefix = f"sentence_{i+1}"
101
+ audio_file = generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer)
102
  audio_files.append(audio_file)
103
  outputs.append((sentence, audio_file))
104
+
 
105
  print(f"Generated sentence: {sentence}")
106
  gr.Markdown(f"**Sentence**: {sentence}")
107
  gr.Audio(value=audio_file, label=sentence)
108
 
109
+ return outputs, audio_files
110
+
111
+ # Main Gradio function
112
+ def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description, tts_model, tts_tokenizer):
113
+ print("Extracting text from PDF...")
114
+ text = pdf_to_text(pdf_file.name)
115
+
116
+ # Translate if translation checkbox is selected
117
+ if translate_checkbox:
118
+ print("Translating text...")
119
+ text = translate(text, source_lang, target_lang)
120
+
121
+ print("Splitting text into sentences...")
122
+ sentences = split_text_into_sentences(text)
123
+
124
+ # Process sentences for audio generation
125
+ outputs, audio_files = process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer)
126
+
127
  print("Combining all audio files...")
128
  combined_output_file = "sentences_combined.wav"
129
  combine_wav_files(combined_output_file, *audio_files)
 
149
  output_group = gr.Group()
150
 
151
  def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
152
+ outputs, combined_output_file = process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description, tts_model, tts_tokenizer)
153
  with output_group:
154
  for sentence, audio_file in outputs:
155
  gr.Markdown(f"**Sentence**: {sentence}")