Update app.py
Browse files
app.py
CHANGED
@@ -15,6 +15,10 @@ import textwrap
|
|
15 |
# Device configuration
|
16 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
17 |
|
|
|
|
|
|
|
|
|
18 |
# Translation function
|
19 |
@spaces.GPU(duration=120)
|
20 |
def translate(source_text, source_lang, target_lang, batch_size=16):
|
@@ -53,14 +57,10 @@ def split_text_into_sentences(text):
|
|
53 |
sentences = sentence_endings.split(text)
|
54 |
return [sentence.strip() for sentence in sentences if sentence.strip()]
|
55 |
|
56 |
-
# Function to generate audio
|
57 |
@spaces.GPU(duration=120)
|
58 |
-
def
|
59 |
-
|
60 |
-
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
|
61 |
-
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
|
62 |
-
|
63 |
-
input_ids = tts_tokenizer(prompt, return_tensors="pt").input_ids.to(device)
|
64 |
prompt_input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
|
65 |
|
66 |
generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
@@ -90,33 +90,40 @@ def update_target_lang_options(source_lang):
|
|
90 |
}
|
91 |
return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
|
92 |
|
93 |
-
#
|
94 |
-
def
|
95 |
-
print("Extracting text from PDF...")
|
96 |
-
text = pdf_to_text(pdf_file.name)
|
97 |
-
|
98 |
-
# Translate if the translation checkbox is selected
|
99 |
-
if translate_checkbox:
|
100 |
-
print("Translating text...")
|
101 |
-
text = translate(text, source_lang, target_lang)
|
102 |
-
|
103 |
-
print("Splitting text into sentences...")
|
104 |
-
sentences = split_text_into_sentences(text)
|
105 |
audio_files = []
|
106 |
outputs = []
|
107 |
-
|
108 |
for i, sentence in enumerate(sentences):
|
109 |
print(f"Generating audio for sentence {i+1}...")
|
110 |
output_file_prefix = f"sentence_{i+1}"
|
111 |
-
audio_file =
|
112 |
audio_files.append(audio_file)
|
113 |
outputs.append((sentence, audio_file))
|
114 |
-
|
115 |
-
# Display each sentence and its corresponding audio immediately
|
116 |
print(f"Generated sentence: {sentence}")
|
117 |
gr.Markdown(f"**Sentence**: {sentence}")
|
118 |
gr.Audio(value=audio_file, label=sentence)
|
119 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
120 |
print("Combining all audio files...")
|
121 |
combined_output_file = "sentences_combined.wav"
|
122 |
combine_wav_files(combined_output_file, *audio_files)
|
@@ -142,7 +149,7 @@ with gr.Blocks() as demo:
|
|
142 |
output_group = gr.Group()
|
143 |
|
144 |
def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
|
145 |
-
outputs, combined_output_file = process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description)
|
146 |
with output_group:
|
147 |
for sentence, audio_file in outputs:
|
148 |
gr.Markdown(f"**Sentence**: {sentence}")
|
|
|
15 |
# Device configuration
|
16 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
17 |
|
18 |
+
# Initialize models and tokenizers outside the functions
|
19 |
+
tts_model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-large-v1").to(device)
|
20 |
+
tts_tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-large-v1")
|
21 |
+
|
22 |
# Translation function
|
23 |
@spaces.GPU(duration=120)
|
24 |
def translate(source_text, source_lang, target_lang, batch_size=16):
|
|
|
57 |
sentences = sentence_endings.split(text)
|
58 |
return [sentence.strip() for sentence in sentences if sentence.strip()]
|
59 |
|
60 |
+
# Function to generate audio for a single sentence
|
61 |
@spaces.GPU(duration=120)
|
62 |
+
def generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer):
|
63 |
+
input_ids = tts_tokenizer(sentence, return_tensors="pt").input_ids.to(device)
|
|
|
|
|
|
|
|
|
64 |
prompt_input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(device)
|
65 |
|
66 |
generation = tts_model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
|
|
|
90 |
}
|
91 |
return gr.update(choices=options.get(source_lang, []), value=options.get(source_lang, [])[0])
|
92 |
|
93 |
+
# Function to process sentences for audio generation
|
94 |
+
def process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
95 |
audio_files = []
|
96 |
outputs = []
|
97 |
+
|
98 |
for i, sentence in enumerate(sentences):
|
99 |
print(f"Generating audio for sentence {i+1}...")
|
100 |
output_file_prefix = f"sentence_{i+1}"
|
101 |
+
audio_file = generate_single_wav_from_text(sentence, description, output_file_prefix, tts_model, tts_tokenizer)
|
102 |
audio_files.append(audio_file)
|
103 |
outputs.append((sentence, audio_file))
|
104 |
+
|
|
|
105 |
print(f"Generated sentence: {sentence}")
|
106 |
gr.Markdown(f"**Sentence**: {sentence}")
|
107 |
gr.Audio(value=audio_file, label=sentence)
|
108 |
|
109 |
+
return outputs, audio_files
|
110 |
+
|
111 |
+
# Main Gradio function
|
112 |
+
def process_pdf(pdf_file, translate_checkbox, source_lang, target_lang, description, tts_model, tts_tokenizer):
|
113 |
+
print("Extracting text from PDF...")
|
114 |
+
text = pdf_to_text(pdf_file.name)
|
115 |
+
|
116 |
+
# Translate if translation checkbox is selected
|
117 |
+
if translate_checkbox:
|
118 |
+
print("Translating text...")
|
119 |
+
text = translate(text, source_lang, target_lang)
|
120 |
+
|
121 |
+
print("Splitting text into sentences...")
|
122 |
+
sentences = split_text_into_sentences(text)
|
123 |
+
|
124 |
+
# Process sentences for audio generation
|
125 |
+
outputs, audio_files = process_sentences_for_audio(sentences, description, tts_model, tts_tokenizer)
|
126 |
+
|
127 |
print("Combining all audio files...")
|
128 |
combined_output_file = "sentences_combined.wav"
|
129 |
combine_wav_files(combined_output_file, *audio_files)
|
|
|
149 |
output_group = gr.Group()
|
150 |
|
151 |
def handle_process(pdf_input, translate_checkbox, source_lang, target_lang, description):
|
152 |
+
outputs, combined_output_file = process_pdf(pdf_input, translate_checkbox, source_lang, target_lang, description, tts_model, tts_tokenizer)
|
153 |
with output_group:
|
154 |
for sentence, audio_file in outputs:
|
155 |
gr.Markdown(f"**Sentence**: {sentence}")
|