Update app.py
app.py CHANGED
@@ -9,14 +9,19 @@ import zipfile
-    print("⚠️ pydub is not available. Audio
-# --- Constants
@@ -26,51 +31,61 @@ SPEAKER_VOICES = [
-# --- Helper functions
-    # Ensure we are writing to a path Gradio can access (usually current dir is fine for temp files)
-    num_channels = 1
-    chunk_size = 36 + data_size
-        b"RIFF", chunk_size, b"WAVE", b"fmt ", 16,
-    bits_per_sample = 16
-    rate = 24000
-    """Load text from a Gradio file object."""
@@ -78,7 +93,6 @@ def load_text_from_gr_file(file_obj):
-        print(f"📄 Text loaded from file: {len(content)} characters")
@@ -88,25 +102,39 @@ def smart_text_split(text, max_size=3800):
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-        if
-        else:
@@ -114,24 +142,32 @@ def smart_text_split(text, max_size=3800):
-        print(f"🔄 Merging {len(file_paths)} audio files...")
-                combined +=
-                print(f"⚠️ File not found: {file_path}")
-        print(f"✅ Merged file saved: {abs_output_path}")
@@ -141,30 +177,37 @@ def create_zip_file(file_paths, zip_name):
-        print(f"📦 ZIP file created: {abs_zip_name}")
-# --- Main generation function (modified for Gradio) ---
-    use_file_input_checkbox, text_file_obj,
-    merge_checkbox, delete_partials_checkbox
-    print("🚀 Starting Text-to-Speech process...")
@@ -187,6 +230,7 @@ def generate_audio_for_gradio(
@@ -196,22 +240,25 @@
-    for i,
-        status_messages.append(f"   📄 Chunk {i+1}: {len(
-    # Create a unique temp directory for this run to avoid conflicts
-    output_base_name_safe = re.sub(r'\W+', '_', output_filename_base_input) # Sanitize filename
@@ -223,244 +270,281 @@
-            # Save chunks in the temporary directory
-                if (
-                    inline_data =
-            status_messages.append(f"❌ Error generating chunk {i+1}: {e}")
-        if i <
-        status_messages.append("❌ No audio files were generated!")
-            status_messages.append(
-            if
-                output_download_path = zip_path
-                # No single audio player if zipped
-            status_messages.append(
-            if
-                        try:
-                        except Exception as e_del:
-                            status_messages.append(f"   ⚠️ Could not delete {os.path.basename(file_p)}: {e_del}")
-            else: # Merge failed, provide ZIP
-                if success_zip:
-                    output_download_path = zip_p
-            status_messages.append(
-            if
-                output_download_path = zip_path
-                # No single audio player if zipped
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        "
-        with gr.Column(scale=
-                label="🔑 Gemini API Key",
-                type="password",
-                placeholder="Enter your Gemini API Key here"
-            )
-            use_file = gr.Checkbox(label="📁 Use Text File Input", value=False)
-                label="Upload Text File
-                label="📝 Text to Speak",
-                lines=
-                placeholder="Enter text here
-            # Dynamic visibility for text input vs file input
-            output_filename_base = gr.Textbox(
-                label="💾 Output Filename Base",
-                value="gemini_tts_output",
-                info="Base name for generated files (no extension)."
-            )
-                MODELS,
-                label="🤖 Model",
-                value=MODELS[0]
-                SPEAKER_VOICES,
-                label="🎤 Speaker Voice",
-                value="Charon"
-                minimum=0.0, maximum=
-                info="Controls randomness
-                minimum=
-                info="Text is split
-                minimum=
-                label="⏱️ Sleep Between
-                info="Helps manage API rate limits
-    submit_button = gr.Button("✨ Generate Audio ✨", variant="primary")
-            output_audio_player = gr.Audio(label="🎧 Generated Audio Output", type="filepath")
-    status_textbox = gr.Textbox(label="📋 Status Log", lines=10, interactive=False)
-    # Connect button to the function
-            merge_audio, delete_partials
-    gr.Markdown(f"Created by aigolden - pydub available: {PYDUB_AVAILABLE}")
-    encoded_text = "Q3JlYXRlIGJ5IDogYWlnb2xkZW4="
-        decoded_text = base64.b64decode(encoded_text.encode()).decode()
-        gr.Markdown(f"<
-    except:
-    # Example Usage (if needed, for testing locally)
-        [
-        ],
-        [
-            "YOUR_API_KEY_HERE", False, None, "An excited news anchor.", "Breaking news! Artificial intelligence can now generate human-like speech with incredible clarity. This opens up a world of possibilities for content creation and accessibility.", 3000, 12, 0.9, MODELS[1], "Achernar", "example_news", True, True
-        ]
-        cache_examples=False # Set to True if inputs are static and fn is pure
@@ -9,14 +9,19 @@ import zipfile
from google import genai
from google.genai import types

+# Attempt to load API key from Hugging Face Secrets
+# The Space's runtime will inject this environment variable if the secret is set.
+HF_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
+
try:
    from pydub import AudioSegment
    PYDUB_AVAILABLE = True
except ImportError:
    PYDUB_AVAILABLE = False
+    print("⚠️ pydub is not available. Audio file merging will be disabled.")
+    print("If merging is desired, ensure pydub is in requirements.txt and ffmpeg is available in the environment.")

+# --- Constants ---
SPEAKER_VOICES = [
    "Achird", "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    "Laomedeia", "Achernar", "Alnilam", "Schedar", "Gacrux", "Pulcherrima",
@@ -26,51 +31,61 @@ SPEAKER_VOICES = [
]
MODELS = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]

+# --- Helper functions ---
def save_binary_file(file_name, data):
    abs_file_name = os.path.abspath(file_name)
+    try:
+        with open(abs_file_name, "wb") as f:
+            f.write(data)
+        print(f"✅ File saved at: {abs_file_name}")
+        return abs_file_name
+    except Exception as e:
+        print(f"❌ Error saving file {abs_file_name}: {e}")
+        return None

def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    parameters = parse_audio_mime_type(mime_type)
    bits_per_sample = parameters["bits_per_sample"]
    sample_rate = parameters["rate"]
+    num_channels = 1 # Gemini TTS seems to output mono
    data_size = len(audio_data)
    bytes_per_sample = bits_per_sample // 8
    block_align = num_channels * bytes_per_sample
    byte_rate = sample_rate * block_align
+    chunk_size = 36 + data_size # Size of the 'fmt ' and 'data' chunks and their headers
+
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF", chunk_size, b"WAVE", b"fmt ", 16, # 16 for PCM
+        1, # PCM format
+        num_channels, sample_rate, byte_rate, block_align, bits_per_sample,
+        b"data", data_size
    )
    return header + audio_data

def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
+    bits_per_sample = 16 # Default
+    rate = 24000 # Default for Gemini TTS
+    if mime_type:
+        mime_type_lower = mime_type.lower()
+        parts = mime_type_lower.split(";")
+        for param in parts:
+            param = param.strip()
+            if param.startswith("rate="):
+                try:
+                    rate_str = param.split("=", 1)[1]
+                    rate = int(rate_str)
+                except (ValueError, IndexError): pass
+            elif param.startswith("audio/l"): # e.g., audio/L16 or audio/L24
+                try:
+                    # Attempt to parse bits from "L<bits>"
+                    potential_bits = param.split("l", 1)[1]
+                    if potential_bits.isdigit():
+                        bits_per_sample = int(potential_bits)
+                except (ValueError, IndexError): pass
    return {"bits_per_sample": bits_per_sample, "rate": rate}

def load_text_from_gr_file(file_obj):
    if file_obj is None:
        return "", "No file provided for text input."
    try:
@@ -78,7 +93,6 @@ def load_text_from_gr_file(file_obj):
            content = f.read().strip()
        if not content:
            return "", "Text file is empty."
        return content, f"Successfully loaded {len(content)} chars from {os.path.basename(file_obj.name)}."
    except Exception as e:
        return "", f"Error reading text file: {e}"
@@ -88,25 +102,39 @@ def smart_text_split(text, max_size=3800):
        return [text]
    chunks = []
    current_chunk = ""
+    sentences = re.split(r'(?<=[.!?])\s+', text) # Split by sentences
    for sentence in sentences:
+        if not sentence: continue
+        # If adding the current sentence exceeds max_size
+        if len(current_chunk) + len(sentence) + 1 > max_size: # +1 for space
+            if current_chunk: # If there's something in current_chunk, add it
                chunks.append(current_chunk.strip())
+                current_chunk = "" # Reset current_chunk
+
+            # If the sentence itself is too long, split it by words or even characters
            if len(sentence) > max_size:
+                words = sentence.split(' ')
+                temp_sentence_part = ""
+                for word in words:
+                    if len(temp_sentence_part) + len(word) + 1 > max_size:
+                        if temp_sentence_part: chunks.append(temp_sentence_part.strip())
+                        # If word itself is too long (rare for TTS practical limits)
+                        if len(word) > max_size:
+                            for i in range(0, len(word), max_size):
+                                chunks.append(word[i:i+max_size])
+                            temp_sentence_part = ""
+                        else:
+                            temp_sentence_part = word
+                    else:
+                        temp_sentence_part += (" " if temp_sentence_part else "") + word
+                if temp_sentence_part: chunks.append(temp_sentence_part.strip())
+                # current_chunk remains empty as the long sentence was fully processed
+            else: # Sentence is not too long itself, start a new chunk with it
                current_chunk = sentence
+        else: # Sentence fits, add to current_chunk
            current_chunk += (" " if current_chunk else "") + sentence
+
+    if current_chunk: # Add any remaining part
        chunks.append(current_chunk.strip())
    return chunks
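smart_text_split() keeps whole sentences together and only falls back to word- or character-level splitting when a single sentence exceeds the limit. A quick illustrative check, assuming the function is in scope:

# Illustrative only: verify that no chunk exceeds the requested size.
sample_text = "First sentence here. Second one follows! Does a third fit? " * 200
chunks = smart_text_split(sample_text, max_size=3800)
print(f"{len(chunks)} chunks, sizes: {[len(c) for c in chunks]}")
assert all(len(c) <= 3800 for c in chunks)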

@@ -114,24 +142,32 @@ def smart_text_split(text, max_size=3800):
def merge_audio_files_func(file_paths, output_path):
    if not PYDUB_AVAILABLE:
        return False, "pydub is not available. Cannot merge files.", None
+    if not file_paths:
+        return False, "No audio files to merge.", None
    try:
        combined = AudioSegment.empty()
        for i, file_path in enumerate(file_paths):
            if os.path.exists(file_path):
+                try:
+                    # Explicitly state format if known, otherwise pydub tries to guess
+                    # Assuming all inputs are WAV due to our conversion logic
+                    audio = AudioSegment.from_file(file_path, format="wav")
+                    combined += audio
+                    if i < len(file_paths) - 1:
+                        combined += AudioSegment.silent(duration=200) # Small silence
+                except Exception as e_load:
+                    print(f"⚠️ Error loading audio file {file_path} with pydub: {e_load}")
+                    return False, f"Error loading audio file {os.path.basename(file_path)}: {e_load}", None
            else:
+                print(f"⚠️ File not found for merging: {file_path}")
+                # Decide if this is critical; for now, we'll say it is.
+                return False, f"File not found for merging: {os.path.basename(file_path)}", None

        abs_output_path = os.path.abspath(output_path)
        combined.export(abs_output_path, format="wav")
        return True, f"Merged file saved: {os.path.basename(abs_output_path)}", abs_output_path
    except Exception as e:
+        print(f"❌ Error merging files: {e}")
        return False, f"Error merging files: {e}", None

def create_zip_file(file_paths, zip_name):
@@ -141,30 +177,37 @@ def create_zip_file(file_paths, zip_name):
        for file_path in file_paths:
            if os.path.exists(file_path):
                zipf.write(file_path, os.path.basename(file_path))
        return True, f"ZIP file created: {os.path.basename(abs_zip_name)}", abs_zip_name
    except Exception as e:
        return False, f"Error creating ZIP file: {e}", None

+# --- Main generation function (modified for Gradio & HF Secrets) ---
def generate_audio_for_gradio(
+    # api_key_input_field is removed, will use HF_GEMINI_API_KEY
+    use_file_input_checkbox, text_file_obj,
    speech_prompt_input, text_to_speak_input,
    max_chunk_slider, sleep_slider, temperature_slider,
    model_dropdown, speaker_dropdown, output_filename_base_input,
+    merge_checkbox, delete_partials_checkbox,
+    # Progress for Gradio (optional but good for long tasks)
+    progress=gr.Progress(track_tqdm=True)
):
    status_messages = []
    status_messages.append("🚀 Starting Text-to-Speech process...")
+    progress(0, desc="Initializing...")
+
+    # 1. API Key Validation (from HF Secrets)
+    api_key_to_use = HF_GEMINI_API_KEY
+    if not api_key_to_use:
+        # Fallback if user provides one in a field (though we removed the field)
+        # This part can be removed if you *only* want to use secrets
+        # For now, let's assume if HF_GEMINI_API_KEY is None, we raise an error.
+        status_messages.append("❌ Error: GEMINI_API_KEY not found in Hugging Face Secrets.")
+        status_messages.append("➡️ Please set it in your Space's Settings > Secrets.")
        return None, None, "\n".join(status_messages)
+
+    os.environ["GEMINI_API_KEY"] = api_key_to_use # Set for genai library
+    status_messages.append("🔑 API Key loaded from Secrets.")

    # 2. Determine Text Input
    actual_text_input = ""
@@ -187,6 +230,7 @@ def generate_audio_for_gradio(
    # 3. Initialize GenAI Client
    try:
        status_messages.append("🛠️ Initializing Gemini client...")
+        progress(0.1, desc="Initializing Gemini Client...")
        client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
        status_messages.append("✅ Gemini client initialized.")
    except Exception as e:
@@ -196,22 +240,25 @@
    # 4. Split text
    text_chunks = smart_text_split(actual_text_input, int(max_chunk_slider))
    status_messages.append(f"📝 Text split into {len(text_chunks)} chunk(s).")
+    for i, chunk_text in enumerate(text_chunks): # Renamed 'chunk' to 'chunk_text'
+        status_messages.append(f"   📄 Chunk {i+1}: {len(chunk_text)} chars")

    # 5. Generate audio for each chunk
    generated_audio_files = []
    run_id = base64.urlsafe_b64encode(os.urandom(6)).decode()
    temp_output_dir = f"temp_audio_{run_id}"
    os.makedirs(temp_output_dir, exist_ok=True)
+    output_base_name_safe = re.sub(r'[\s\\\/\:\*\?\"\<\>\|\%]+', '_', output_filename_base_input) # More robust sanitize

+    total_chunks = len(text_chunks)
+    for i, chunk_text_content in enumerate(text_chunks):
+        progress_val = 0.1 + (0.7 * (i / total_chunks)) # Progress from 10% to 80% during generation
+        progress(progress_val, desc=f"Generating chunk {i+1}/{total_chunks}...")
+
+        status_messages.append(f"\n🔊 Generating audio for chunk {i+1}/{total_chunks}...")
+        final_text_for_api = f'"{speech_prompt_input}"\n{chunk_text_content}' if speech_prompt_input.strip() else chunk_text_content
+
+        contents_for_api = [types.Content(role="user", parts=[types.Part.from_text(text=final_text_for_api)])]
        generate_content_config = types.GenerateContentConfig(
            temperature=float(temperature_slider),
            response_modalities=["audio"],
@@ -223,244 +270,281 @@
        )
        try:
            chunk_filename_base = f"{output_base_name_safe}_part_{i+1:03d}"
            chunk_filepath_prefix = os.path.join(temp_output_dir, chunk_filename_base)
+
+            audio_data_received = False
+            for stream_response_chunk in client.models.generate_content_stream(
+                model=model_dropdown, contents=contents_for_api, config=generate_content_config,
            ):
+                if (stream_response_chunk.candidates and stream_response_chunk.candidates[0].content and
+                    stream_response_chunk.candidates[0].content.parts and
+                    stream_response_chunk.candidates[0].content.parts[0].inline_data):

+                    inline_data = stream_response_chunk.candidates[0].content.parts[0].inline_data
                    data_buffer = inline_data.data
+                    api_mime_type = inline_data.mime_type
+                    audio_data_received = True
+
+                    status_messages.append(f"ℹ️ API returned MIME type: {api_mime_type}")
+
+                    # Determine file extension and convert if necessary
+                    file_extension = ".wav" # Default to .wav and convert
+                    if api_mime_type and ("mp3" in api_mime_type.lower() or "mpeg" in api_mime_type.lower()):
+                        file_extension = ".mp3"
+                        # For MP3, data_buffer is already MP3. No conversion needed for saving.
+                        # pydub will need ffmpeg to read MP3 for merging.
+                        status_messages.append(f"ℹ️ Saving as MP3 based on MIME: {api_mime_type}")
+                    elif api_mime_type and "wav" in api_mime_type.lower() and \
+                         not ("audio/l16" in api_mime_type.lower() or "audio/l24" in api_mime_type.lower()):
+                        file_extension = ".wav"
+                        # API says WAV and it's not raw L16/L24, trust it.
+                        status_messages.append(f"ℹ️ Saving as WAV based on MIME: {api_mime_type}")
+                    else: # Raw PCM (like audio/L16), unknown, or .bin -> convert to WAV
+                        file_extension = ".wav"
+                        status_messages.append(f"ℹ️ Converting to WAV for MIME: {api_mime_type or 'Unknown'}")
+                        data_buffer = convert_to_wav(data_buffer, api_mime_type)
+
+                    status_messages.append(f"ℹ️ Determined file extension: {file_extension}")
+
                    generated_file_path = save_binary_file(f"{chunk_filepath_prefix}{file_extension}", data_buffer)
+                    if generated_file_path:
+                        generated_audio_files.append(generated_file_path)
+                        status_messages.append(f"✅ Chunk {i+1} saved: {os.path.basename(generated_file_path)}")
+                    else:
+                        status_messages.append(f"❌ Failed to save chunk {i+1}.")
+                    break # Processed this audio data from stream
+
+                elif stream_response_chunk.text:
+                    status_messages.append(f"ℹ️ API Text Message (during stream): {stream_response_chunk.text}")
+
+            if not audio_data_received:
+                status_messages.append(f"❌ No audio data received in stream for chunk {i+1}.")
+                # Check for errors in the stream response if available
+                if stream_response_chunk and stream_response_chunk.prompt_feedback and stream_response_chunk.prompt_feedback.block_reason:
+                    status_messages.append(f"🛑 API Block Reason: {stream_response_chunk.prompt_feedback.block_reason_message or stream_response_chunk.prompt_feedback.block_reason}")
+
+
+        except types.BlockedPromptException as bpe:
+            status_messages.append(f"❌ Content blocked for chunk {i+1}: {bpe}")
+            status_messages.append(f"   Feedback: {bpe.response.prompt_feedback}")
+        except types.StopCandidateException as sce:
+            status_messages.append(f"❌ Generation stopped for chunk {i+1}: {sce}")
+            status_messages.append(f"   Feedback: {sce.response.prompt_feedback}")
        except Exception as e:
+            status_messages.append(f"❌ Error generating/processing chunk {i+1}: {e}")
+            import traceback
+            status_messages.append(traceback.format_exc()) # More detailed error
+            continue

+        if i < total_chunks - 1:
            status_messages.append(f"⏱️ Waiting {sleep_slider}s...")
            time.sleep(float(sleep_slider))

+    progress(0.85, desc="Processing generated files...")
    # 6. Handle output files
    if not generated_audio_files:
+        status_messages.append("❌ No audio files were successfully generated or saved!")
+        final_status = "\n".join(status_messages)
+        print(final_status)
+        progress(1, desc="Finished with errors.")
+        return None, None, final_status

    status_messages.append(f"\n🎉 {len(generated_audio_files)} audio file(s) generated!")

+    output_audio_path_for_player = None # For gr.Audio, ideally a single WAV
+    output_path_for_download = None # For gr.File, can be WAV or ZIP

    if merge_checkbox and len(generated_audio_files) > 1:
        if not PYDUB_AVAILABLE:
            status_messages.append("⚠️ pydub not available. Cannot merge. Returning ZIP of parts.")
+            success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
+            status_messages.append(msg_zip)
+            if success_zip: output_path_for_download = zip_p
        else:
+            status_messages.append(f"🔄 Merging {len(generated_audio_files)} files (all should be WAVs now)...")
+            # Ensure all files for merging are WAV, convert if any MP3s were saved and pydub is used
+            # For simplicity, our save logic now tries to make them WAV if not MP3 from API.
+            # If an MP3 was saved and PYDUB_AVAILABLE, it should handle it.
+
            merged_filename_path = os.path.join(temp_output_dir, f"{output_base_name_safe}_merged.wav")
+            success_merge, msg_merge, merged_p = merge_audio_files_func(generated_audio_files, merged_filename_path)
+            status_messages.append(msg_merge)
+            if success_merge:
+                output_audio_path_for_player = merged_p
+                output_path_for_download = merged_p
                if delete_partials_checkbox:
                    status_messages.append("🗑️ Deleting partial files...")
                    for file_p in generated_audio_files:
+                        try: os.remove(file_p); status_messages.append(f"   🗑️ Deleted: {os.path.basename(file_p)}")
+                        except Exception as e_del: status_messages.append(f"   ⚠️ Could not delete {os.path.basename(file_p)}: {e_del}")
+            else:
                status_messages.append("❌ Merge failed. Providing ZIP of parts.")
                success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
                status_messages.append(msg_zip)
+                if success_zip: output_path_for_download = zip_p
    elif len(generated_audio_files) == 1:
+        # Single file, should be WAV due to our conversion logic or MP3 if API sent that
+        single_file_path = generated_audio_files[0]
+        if single_file_path.lower().endswith(".mp3") and PYDUB_AVAILABLE:
+            # Convert MP3 to WAV for Gradio player if it prefers WAV
+            # Or, gr.Audio might handle MP3 directly. Let's test.
+            # For now, assume gr.Audio handles common types.
+            output_audio_path_for_player = single_file_path
+            status_messages.append(f"🎵 Single MP3 file: {os.path.basename(single_file_path)}")
+        else: # Assume WAV
+            output_audio_path_for_player = single_file_path
+            status_messages.append(f"🎵 Single WAV file: {os.path.basename(single_file_path)}")
+        output_path_for_download = single_file_path
    else: # Multiple files, no merge requested
        status_messages.append("📦 Multiple parts generated. Creating ZIP file.")
+        success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
+        status_messages.append(msg_zip)
+        if success_zip: output_path_for_download = zip_p

    final_status = "\n".join(status_messages)
    print(final_status)
+    print(f"DEBUG: output_audio_path_for_player: {output_audio_path_for_player}")
+    print(f"DEBUG: output_path_for_download: {output_path_for_download}")
+    progress(1, desc="Finished!")
+    return output_audio_path_for_player, output_path_for_download, final_status
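Because all of the handler's inputs are plain Python values, it can also be exercised without the UI. A rough smoke test (an assumption, not part of the commit; it makes a real API call and substitutes a no-op for the Gradio progress callback) could be:

# Hypothetical headless call; requires GEMINI_API_KEY to be set in the environment.
audio_path, download_path, log = generate_audio_for_gradio(
    False, None,                      # use_file_input_checkbox, text_file_obj
    "A calm narrator.", "Hello from a quick smoke test.",
    3800, 2, 0.7,                     # max_chunk_slider, sleep_slider, temperature_slider
    MODELS[0], "Charon", "smoke_test",
    True, True,                       # merge_checkbox, delete_partials_checkbox
    progress=lambda *args, **kwargs: None,  # skip Gradio progress tracking
)
print(log)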

# --- Gradio Interface Definition ---
+with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
    gr.Markdown("# 🎵 Gemini Text-to-Speech UI 🗣️")
+    if not HF_GEMINI_API_KEY:
+        gr.Warning(
+            "GEMINI_API_KEY not found in Hugging Face Secrets. "
+            "Please add it in your Space's 'Settings' > 'Secrets' tab for the app to work. "
+            "Name the secret `GEMINI_API_KEY`."
+        )
+    else:
+        gr.Info("Gemini API Key loaded successfully from Space Secrets. Ready to generate!")
+
    gr.Markdown(
        "Convert text to speech using Google's Gemini API. "
+        "Your Gemini API Key must be set as a Secret named `GEMINI_API_KEY` in this Space's settings."
+        "\n\nGet your API Key from [Google AI Studio](https://aistudio.google.com/app/apikey)."
    )

    with gr.Row():
+        with gr.Column(scale=2): # Wider column for text inputs
+            use_file = gr.Checkbox(label="📁 Use Text File Input (.txt)", value=False)
            text_file = gr.File(
+                label="Upload Text File", # Simpler label
                file_types=['.txt'],
                visible=False # Initially hidden
            )
            text_to_speak = gr.Textbox(
+                label="📝 Text to Speak (or use file above)",
+                lines=10,
+                placeholder="Enter text here...",
                visible=True # Initially visible
            )
            use_file.change(
                lambda x: (gr.update(visible=x), gr.update(visible=not x)),
                [use_file],
                [text_file, text_to_speak]
            )
            speech_prompt = gr.Textbox(
                label="🗣️ Speech Prompt (Optional)",
                placeholder="e.g., 'As an energetic YouTuber speaking to an audience'",
                info="Influences style, emotion, and voice characteristics."
            )

        with gr.Column(scale=1):
            model_name = gr.Dropdown(
+                MODELS, label="🤖 Model", value=MODELS[0]
            )
            speaker_voice = gr.Dropdown(
+                SPEAKER_VOICES, label="🎤 Speaker Voice", value="Charon"
            )
            temperature = gr.Slider(
+                minimum=0.0, maximum=1.0, step=0.05, value=0.7, # Gemini TTS often uses temp <= 1
                label="🌡️ Temperature",
+                info="Controls randomness (0.0-1.0). Higher for more variation."
            )
            max_chunk_size = gr.Slider(
+                minimum=1000, maximum=4000, step=100, value=3800,
                label="🧩 Max Characters per Chunk",
+                info="Text is split for API. Max 4096 per request for some models."
            )
            sleep_between_requests = gr.Slider(
+                minimum=1, maximum=15, step=0.5, value=2, # Reduced default sleep
+                label="⏱️ Sleep Between Chunks (sec)",
+                info="Helps manage API rate limits (e.g. Gemini Flash has 60 RPM limit)."
            )
+            output_filename_base = gr.Textbox(
+                label="💾 Output Filename Base", value="gemini_tts_audio"
+            )
+
+            with gr.Group(visible=PYDUB_AVAILABLE):
+                merge_audio = gr.Checkbox(label="🔗 Merge Audio Chunks (if >1)", value=True)
+                delete_partials = gr.Checkbox(label="🗑️ Delete Chunks After Merge", value=True, visible=True) # Default visible
                merge_audio.change(lambda x: gr.update(visible=x), [merge_audio], [delete_partials])
+
+            if not PYDUB_AVAILABLE:
+                gr.Markdown("<small>⚠️ Merging disabled: `pydub` library not found. Install if needed.</small>")


+    submit_button = gr.Button("✨ Generate Audio ✨", variant="primary", scale=2) # Centered button

    with gr.Row():
        with gr.Column(scale=1):
+            output_audio_player = gr.Audio(label="🎧 Generated Audio Output", type="filepath", format="wav") # Specify format if known
        with gr.Column(scale=1):
            output_file_download = gr.File(label="📥 Download Output File", type="filepath")

+    status_textbox = gr.Textbox(label="📋 Status Log", lines=10, interactive=False, max_lines=20)

    submit_button.click(
        fn=generate_audio_for_gradio,
        inputs=[
+            use_file, text_file, speech_prompt, text_to_speak,
            max_chunk_size, sleep_between_requests, temperature,
            model_name, speaker_voice, output_filename_base,
+            merge_audio, delete_partials # Even if not visible, pass them
        ],
        outputs=[output_audio_player, output_file_download, status_textbox]
    )

    gr.Markdown("---")
    # The encoded text part:
+    encoded_text = "Q3JlYXRlIGJ5IDogYWlnb2xkZW4=" # "Created by : aigolden"
    try:
+        decoded_text = base64.b64decode(encoded_text.encode('utf-8')).decode('utf-8')
+        gr.Markdown(f"<p style='text-align:center; font-size:small;'><em>{decoded_text}</em></p>")
+    except Exception as e_decode:
+        print(f"Error decoding/displaying credit: {e_decode}")
        pass
+
    gr.Examples(
        examples=[
+            [False, None, "A friendly and informative narrator.", "Hello world, this is a test of the Gemini text to speech API using Gradio. I hope this works well!", 3800, 2, 0.7, MODELS[0], "Charon", "example_hello", True, True],
+            [False, None, "An excited news reporter.", "Breaking news! Artificial intelligence can now generate human-like speech. This technology is rapidly evolving!", 3000, 2, 0.8, MODELS[1], "Achernar", "example_news", True, True],
+            [True, "sample_text.txt", "A calm storyteller.", "", 3500, 3, 0.6, MODELS[0], "Vindemiatrix", "example_from_file", True, False]
        ],
+        fn=generate_audio_for_gradio, # Ensure example fn is the same as main
+        inputs=[ # Ensure these match the function's inputs exactly (order and number)
+            use_file, text_file, speech_prompt, text_to_speak,
            max_chunk_size, sleep_between_requests, temperature,
            model_name, speaker_voice, output_filename_base,
            merge_audio, delete_partials
        ],
        outputs=[output_audio_player, output_file_download, status_textbox],
+        cache_examples=False # API calls, so don't cache results based on static inputs
    )
+    gr.Markdown("<small>To use the 'example_from_file', please create a `sample_text.txt` file in the root of this Space with some text content, or upload your own text file.</small>")


if __name__ == "__main__":
    if not PYDUB_AVAILABLE:
        print("WARNING: pydub library is not installed or working. Audio file merging will be disabled.")
+    if not HF_GEMINI_API_KEY:
+        print("WARNING: GEMINI_API_KEY environment variable not set. The app might not work in local if it relies on this for API key.")
+
+    # For local testing, you might want to provide a way to input the API key
+    # or set the GEMINI_API_KEY environment variable before running.
+    # e.g., export GEMINI_API_KEY="your_key_here"
+    # then run python app.py
+
+    demo.launch(debug=True, share=False) # share=False for local, HF Spaces handles public link
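The third example row and the closing note refer to a sample_text.txt that is not included in this commit. If you want that example to work, one way to provide it (the contents here are an arbitrary assumption) is:

# Create a small sample file for the file-based example.
with open("sample_text.txt", "w", encoding="utf-8") as f:
    f.write("This is sample text for the file-based example. It has a few short sentences. Enjoy!")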