Hamed744 committed on
Commit
1cae8c0
·
verified ·
1 Parent(s): 3f42665

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +236 -284
app.py CHANGED
@@ -8,9 +8,9 @@ import time
8
  import zipfile
9
  from google import genai
10
  from google.genai import types
 
11
 
12
- # Attempt to load API key from Hugging Face Secrets
13
- # The Space's runtime will inject this environment variable if the secret is set.
14
  HF_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
15
 
16
  try:
@@ -18,10 +18,9 @@ try:
18
  PYDUB_AVAILABLE = True
19
  except ImportError:
20
  PYDUB_AVAILABLE = False
21
- print("⚠️ pydub is not available. Audio file merging will be disabled.")
22
- print("If merging is desired, ensure pydub is in requirements.txt and ffmpeg is available in the environment.")
23
 
24
- # --- Constants ---
25
  SPEAKER_VOICES = [
26
  "Achird", "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
27
  "Laomedeia", "Achernar", "Alnilam", "Schedar", "Gacrux", "Pulcherrima",
@@ -30,145 +29,128 @@ SPEAKER_VOICES = [
30
  "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda"
31
  ]
32
  MODELS = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
33
-
34
- # --- Helper functions ---
 
 
 
 
 
 
 
 
 
 
 
35
  def save_binary_file(file_name, data):
36
  abs_file_name = os.path.abspath(file_name)
37
  try:
38
  with open(abs_file_name, "wb") as f:
39
  f.write(data)
40
- print(f"✅ File saved at: {abs_file_name}")
41
  return abs_file_name
42
  except Exception as e:
43
- print(f"❌ Error saving file {abs_file_name}: {e}")
44
  return None
45
 
46
  def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
47
  parameters = parse_audio_mime_type(mime_type)
48
  bits_per_sample = parameters["bits_per_sample"]
49
  sample_rate = parameters["rate"]
50
- num_channels = 1 # Gemini TTS seems to output mono
51
  data_size = len(audio_data)
52
  bytes_per_sample = bits_per_sample // 8
53
  block_align = num_channels * bytes_per_sample
54
  byte_rate = sample_rate * block_align
55
- chunk_size = 36 + data_size # Size of the 'fmt ' and 'data' chunks and their headers
56
-
57
  header = struct.pack(
58
  "<4sI4s4sIHHIIHH4sI",
59
- b"RIFF", chunk_size, b"WAVE", b"fmt ", 16, # 16 for PCM
60
- 1, # PCM format
61
- num_channels, sample_rate, byte_rate, block_align, bits_per_sample,
62
- b"data", data_size
63
  )
64
  return header + audio_data
65
 
66
  def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
67
- bits_per_sample = 16 # Default
68
- rate = 24000 # Default for Gemini TTS
69
  if mime_type:
70
  mime_type_lower = mime_type.lower()
71
  parts = mime_type_lower.split(";")
72
  for param in parts:
73
  param = param.strip()
74
  if param.startswith("rate="):
 
 
 
75
  try:
76
- rate_str = param.split("=", 1)[1]
77
- rate = int(rate_str)
78
- except (ValueError, IndexError): pass
79
- elif param.startswith("audio/l"): # e.g., audio/L16 or audio/L24
80
- try:
81
- # Attempt to parse bits from "L<bits>"
82
  potential_bits = param.split("l", 1)[1]
83
- if potential_bits.isdigit():
84
- bits_per_sample = int(potential_bits)
85
- except (ValueError, IndexError): pass
86
  return {"bits_per_sample": bits_per_sample, "rate": rate}
87
 
88
  def load_text_from_gr_file(file_obj):
89
  if file_obj is None:
90
- return "", "No file provided for text input."
91
  try:
92
  with open(file_obj.name, 'r', encoding='utf-8') as f:
93
  content = f.read().strip()
94
  if not content:
95
- return "", "Text file is empty."
96
- return content, f"Successfully loaded {len(content)} chars from {os.path.basename(file_obj.name)}."
97
  except Exception as e:
98
- return "", f"Error reading text file: {e}"
99
 
100
  def smart_text_split(text, max_size=3800):
101
- if len(text) <= max_size:
102
- return [text]
103
- chunks = []
104
- current_chunk = ""
105
- sentences = re.split(r'(?<=[.!?])\s+', text) # Split by sentences
106
  for sentence in sentences:
107
  if not sentence: continue
108
- # If adding the current sentence exceeds max_size
109
- if len(current_chunk) + len(sentence) + 1 > max_size: # +1 for space
110
- if current_chunk: # If there's something in current_chunk, add it
111
- chunks.append(current_chunk.strip())
112
- current_chunk = "" # Reset current_chunk
113
-
114
- # If the sentence itself is too long, split it by words or even characters
115
  if len(sentence) > max_size:
116
- words = sentence.split(' ')
117
- temp_sentence_part = ""
118
  for word in words:
119
- if len(temp_sentence_part) + len(word) + 1 > max_size:
120
- if temp_sentence_part: chunks.append(temp_sentence_part.strip())
121
- # If word itself is too long (rare for TTS practical limits)
122
  if len(word) > max_size:
123
- for i in range(0, len(word), max_size):
124
- chunks.append(word[i:i+max_size])
125
- temp_sentence_part = ""
126
- else:
127
- temp_sentence_part = word
128
- else:
129
- temp_sentence_part += (" " if temp_sentence_part else "") + word
130
- if temp_sentence_part: chunks.append(temp_sentence_part.strip())
131
- # current_chunk remains empty as the long sentence was fully processed
132
- else: # Sentence is not too long itself, start a new chunk with it
133
- current_chunk = sentence
134
- else: # Sentence fits, add to current_chunk
135
- current_chunk += (" " if current_chunk else "") + sentence
136
-
137
- if current_chunk: # Add any remaining part
138
- chunks.append(current_chunk.strip())
139
  return chunks
140
 
141
-
142
  def merge_audio_files_func(file_paths, output_path):
143
- if not PYDUB_AVAILABLE:
144
- return False, "pydub is not available. Cannot merge files.", None
145
- if not file_paths:
146
- return False, "No audio files to merge.", None
147
  try:
148
  combined = AudioSegment.empty()
149
  for i, file_path in enumerate(file_paths):
150
  if os.path.exists(file_path):
151
  try:
152
- # Explicitly state format if known, otherwise pydub tries to guess
153
- # Assuming all inputs are WAV due to our conversion logic
154
- audio = AudioSegment.from_file(file_path, format="wav")
155
  combined += audio
156
- if i < len(file_paths) - 1:
157
- combined += AudioSegment.silent(duration=200) # Small silence
158
  except Exception as e_load:
159
- print(f"⚠️ Error loading audio file {file_path} with pydub: {e_load}")
160
- return False, f"Error loading audio file {os.path.basename(file_path)}: {e_load}", None
 
161
  else:
162
- print(f"⚠️ File not found for merging: {file_path}")
163
- # Decide if this is critical; for now, we'll say it is.
164
- return False, f"File not found for merging: {os.path.basename(file_path)}", None
165
-
166
  abs_output_path = os.path.abspath(output_path)
167
  combined.export(abs_output_path, format="wav")
168
- return True, f"Merged file saved: {os.path.basename(abs_output_path)}", abs_output_path
169
  except Exception as e:
170
- print(f" Error merging files: {e}")
171
- return False, f"Error merging files: {e}", None
 
172
 
173
  def create_zip_file(file_paths, zip_name):
174
  abs_zip_name = os.path.abspath(zip_name)
@@ -177,85 +159,75 @@ def create_zip_file(file_paths, zip_name):
177
  for file_path in file_paths:
178
  if os.path.exists(file_path):
179
  zipf.write(file_path, os.path.basename(file_path))
180
- return True, f"ZIP file created: {os.path.basename(abs_zip_name)}", abs_zip_name
181
  except Exception as e:
182
- return False, f"Error creating ZIP file: {e}", None
183
 
184
- # --- Main generation function (modified for Gradio & HF Secrets) ---
185
  def generate_audio_for_gradio(
186
- # api_key_input_field is removed, will use HF_GEMINI_API_KEY
187
  use_file_input_checkbox, text_file_obj,
188
  speech_prompt_input, text_to_speak_input,
189
  max_chunk_slider, sleep_slider, temperature_slider,
190
- model_dropdown, speaker_dropdown, output_filename_base_input,
 
191
  merge_checkbox, delete_partials_checkbox,
192
- # Progress for Gradio (optional but good for long tasks)
193
  progress=gr.Progress(track_tqdm=True)
194
  ):
195
  status_messages = []
196
- status_messages.append("🚀 Starting Text-to-Speech process...")
197
- progress(0, desc="Initializing...")
198
 
199
- # 1. API Key Validation (from HF Secrets)
200
  api_key_to_use = HF_GEMINI_API_KEY
201
  if not api_key_to_use:
202
- # Fallback if user provides one in a field (though we removed the field)
203
- # This part can be removed if you *only* want to use secrets
204
- # For now, let's assume if HF_GEMINI_API_KEY is None, we raise an error.
205
- status_messages.append("❌ Error: GEMINI_API_KEY not found in Hugging Face Secrets.")
206
- status_messages.append("➡️ Please set it in your Space's Settings > Secrets.")
207
  return None, None, "\n".join(status_messages)
208
 
209
- os.environ["GEMINI_API_KEY"] = api_key_to_use # Set for genai library
210
- status_messages.append("🔑 API Key loaded from Secrets.")
211
 
212
- # 2. Determine Text Input
213
  actual_text_input = ""
214
  if use_file_input_checkbox:
215
  if text_file_obj is None:
216
- status_messages.append("❌ Error: 'Use Text File' is checked, but no file was uploaded.")
217
  return None, None, "\n".join(status_messages)
218
  actual_text_input, msg = load_text_from_gr_file(text_file_obj)
219
  status_messages.append(msg)
220
- if not actual_text_input:
221
- return None, None, "\n".join(status_messages)
222
  else:
223
  actual_text_input = text_to_speak_input
224
- status_messages.append("⌨️ Using manually entered text.")
225
 
226
  if not actual_text_input or actual_text_input.strip() == "":
227
- status_messages.append("❌ Error: Text input is empty.")
228
  return None, None, "\n".join(status_messages)
229
 
230
- # 3. Initialize GenAI Client
231
  try:
232
- status_messages.append("🛠️ Initializing Gemini client...")
233
- progress(0.1, desc="Initializing Gemini Client...")
234
  client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
235
- status_messages.append("✅ Gemini client initialized.")
236
  except Exception as e:
237
- status_messages.append(f"❌ Error initializing Gemini client: {e}")
238
  return None, None, "\n".join(status_messages)
239
 
240
- # 4. Split text
241
  text_chunks = smart_text_split(actual_text_input, int(max_chunk_slider))
242
- status_messages.append(f"📊 Text split into {len(text_chunks)} chunk(s).")
243
- for i, chunk_text in enumerate(text_chunks): # Renamed 'chunk' to 'chunk_text'
244
- status_messages.append(f" 📝 Chunk {i+1}: {len(chunk_text)} chars")
245
 
246
- # 5. Generate audio for each chunk
247
  generated_audio_files = []
248
  run_id = base64.urlsafe_b64encode(os.urandom(6)).decode()
249
  temp_output_dir = f"temp_audio_{run_id}"
250
  os.makedirs(temp_output_dir, exist_ok=True)
251
- output_base_name_safe = re.sub(r'[\s\\\/\:\*\?\"\<\>\|\%]+', '_', output_filename_base_input) # More robust sanitize
252
 
253
  total_chunks = len(text_chunks)
254
  for i, chunk_text_content in enumerate(text_chunks):
255
- progress_val = 0.1 + (0.7 * (i / total_chunks)) # Progress from 10% to 80% during generation
256
- progress(progress_val, desc=f"Generating chunk {i+1}/{total_chunks}...")
257
 
258
- status_messages.append(f"\n🔊 Generating audio for chunk {i+1}/{total_chunks}...")
259
  final_text_for_api = f'"{speech_prompt_input}"\n{chunk_text_content}' if speech_prompt_input.strip() else chunk_text_content
260
 
261
  contents_for_api = [types.Content(role="user", parts=[types.Part.from_text(text=final_text_for_api)])]
@@ -264,17 +236,16 @@ def generate_audio_for_gradio(
264
  response_modalities=["audio"],
265
  speech_config=types.SpeechConfig(
266
  voice_config=types.VoiceConfig(
267
- prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=speaker_dropdown)
268
  )
269
  )
270
  )
271
  try:
272
  chunk_filename_base = f"{output_base_name_safe}_part_{i+1:03d}"
273
  chunk_filepath_prefix = os.path.join(temp_output_dir, chunk_filename_base)
274
-
275
  audio_data_received = False
276
  for stream_response_chunk in client.models.generate_content_stream(
277
- model=model_dropdown, contents=contents_for_api, config=generate_content_config,
278
  ):
279
  if (stream_response_chunk.candidates and stream_response_chunk.candidates[0].content and
280
  stream_response_chunk.candidates[0].content.parts and
@@ -284,161 +255,136 @@ def generate_audio_for_gradio(
284
  data_buffer = inline_data.data
285
  api_mime_type = inline_data.mime_type
286
  audio_data_received = True
287
-
288
- status_messages.append(f"ℹ️ API returned MIME type: {api_mime_type}")
289
 
290
- # Determine file extension and convert if necessary
291
- file_extension = ".wav" # Default to .wav and convert
292
  if api_mime_type and ("mp3" in api_mime_type.lower() or "mpeg" in api_mime_type.lower()):
293
  file_extension = ".mp3"
294
- # For MP3, data_buffer is already MP3. No conversion needed for saving.
295
- # pydub will need ffmpeg to read MP3 for merging.
296
- status_messages.append(f"ℹ️ Saving as MP3 based on MIME: {api_mime_type}")
297
  elif api_mime_type and "wav" in api_mime_type.lower() and \
298
  not ("audio/l16" in api_mime_type.lower() or "audio/l24" in api_mime_type.lower()):
299
  file_extension = ".wav"
300
- # API says WAV and it's not raw L16/L24, trust it.
301
- status_messages.append(f"ℹ️ Saving as WAV based on MIME: {api_mime_type}")
302
- else: # Raw PCM (like audio/L16), unknown, or .bin -> convert to WAV
303
  file_extension = ".wav"
304
- status_messages.append(f"ℹ️ Converting to WAV for MIME: {api_mime_type or 'Unknown'}")
305
  data_buffer = convert_to_wav(data_buffer, api_mime_type)
306
 
307
- status_messages.append(f"ℹ️ Determined file extension: {file_extension}")
308
 
309
  generated_file_path = save_binary_file(f"{chunk_filepath_prefix}{file_extension}", data_buffer)
310
  if generated_file_path:
311
  generated_audio_files.append(generated_file_path)
312
- status_messages.append(f"✅ Chunk {i+1} saved: {os.path.basename(generated_file_path)}")
313
  else:
314
- status_messages.append(f"❌ Failed to save chunk {i+1}.")
315
- break # Processed this audio data from stream
316
-
317
  elif stream_response_chunk.text:
318
- status_messages.append(f"ℹ️ API Text Message (during stream): {stream_response_chunk.text}")
319
 
320
  if not audio_data_received:
321
- status_messages.append(f"❌ No audio data received in stream for chunk {i+1}.")
322
- # Check for errors in the stream response if available
323
  if stream_response_chunk and stream_response_chunk.prompt_feedback and stream_response_chunk.prompt_feedback.block_reason:
324
- status_messages.append(f"🛑 API Block Reason: {stream_response_chunk.prompt_feedback.block_reason_message or stream_response_chunk.prompt_feedback.block_reason}")
325
-
326
-
327
  except types.BlockedPromptException as bpe:
328
- status_messages.append(f"❌ Content blocked for chunk {i+1}: {bpe}")
329
- status_messages.append(f" Feedback: {bpe.response.prompt_feedback}")
330
  except types.StopCandidateException as sce:
331
- status_messages.append(f"❌ Generation stopped for chunk {i+1}: {sce}")
332
- status_messages.append(f" Feedback: {sce.response.prompt_feedback}")
333
  except Exception as e:
334
- status_messages.append(f"❌ Error generating/processing chunk {i+1}: {e}")
335
- import traceback
336
- status_messages.append(traceback.format_exc()) # More detailed error
337
- continue
338
 
339
- if i < total_chunks - 1:
340
- status_messages.append(f"⏱️ Waiting {sleep_slider}s...")
341
  time.sleep(float(sleep_slider))
342
 
343
- progress(0.85, desc="Processing generated files...")
344
- # 6. Handle output files
345
  if not generated_audio_files:
346
- status_messages.append("❌ No audio files were successfully generated or saved!")
347
  final_status = "\n".join(status_messages)
348
  print(final_status)
349
- progress(1, desc="Finished with errors.")
350
  return None, None, final_status
351
 
352
- status_messages.append(f"\n🎉 {len(generated_audio_files)} audio file(s) generated!")
353
-
354
- output_audio_path_for_player = None # For gr.Audio, ideally a single WAV
355
- output_path_for_download = None # For gr.File, can be WAV or ZIP
356
-
357
- if merge_checkbox and len(generated_audio_files) > 1:
358
- if not PYDUB_AVAILABLE:
359
- status_messages.append("⚠️ pydub not available. Cannot merge. Returning ZIP of parts.")
 
 
 
 
 
 
 
 
 
 
 
 
360
  success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
361
  status_messages.append(msg_zip)
362
  if success_zip: output_path_for_download = zip_p
363
- else:
364
- status_messages.append(f"🔗 Merging {len(generated_audio_files)} files (all should be WAVs now)...")
365
- # Ensure all files for merging are WAV, convert if any MP3s were saved and pydub is used
366
- # For simplicity, our save logic now tries to make them WAV if not MP3 from API.
367
- # If an MP3 was saved and PYDUB_AVAILABLE, it should handle it.
368
-
369
- merged_filename_path = os.path.join(temp_output_dir, f"{output_base_name_safe}_merged.wav")
370
- success_merge, msg_merge, merged_p = merge_audio_files_func(generated_audio_files, merged_filename_path)
371
- status_messages.append(msg_merge)
372
- if success_merge:
373
- output_audio_path_for_player = merged_p
374
- output_path_for_download = merged_p
375
- if delete_partials_checkbox:
376
- status_messages.append("🗑️ Deleting partial files...")
377
- for file_p in generated_audio_files:
378
- try: os.remove(file_p); status_messages.append(f" 🗑️ Deleted: {os.path.basename(file_p)}")
379
- except Exception as e_del: status_messages.append(f" ⚠️ Could not delete {os.path.basename(file_p)}: {e_del}")
380
- else:
381
- status_messages.append("⚠️ Merge failed. Providing ZIP of parts.")
382
- success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
383
- status_messages.append(msg_zip)
384
- if success_zip: output_path_for_download = zip_p
385
  elif len(generated_audio_files) == 1:
386
- # Single file, should be WAV due to our conversion logic or MP3 if API sent that
387
  single_file_path = generated_audio_files[0]
388
- if single_file_path.lower().endswith(".mp3") and PYDUB_AVAILABLE:
389
- # Convert MP3 to WAV for Gradio player if it prefers WAV
390
- # Or, gr.Audio might handle MP3 directly. Let's test.
391
- # For now, assume gr.Audio handles common types.
392
- output_audio_path_for_player = single_file_path
393
- status_messages.append(f"🎵 Single MP3 file: {os.path.basename(single_file_path)}")
394
- else: # Assume WAV
395
- output_audio_path_for_player = single_file_path
396
- status_messages.append(f"🎵 Single WAV file: {os.path.basename(single_file_path)}")
397
  output_path_for_download = single_file_path
398
- else: # Multiple files, no merge requested
399
- status_messages.append("📦 Multiple parts generated. Creating ZIP file.")
 
 
 
400
  success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
401
  status_messages.append(msg_zip)
402
  if success_zip: output_path_for_download = zip_p
403
-
404
  final_status = "\n".join(status_messages)
405
  print(final_status)
406
- print(f"DEBUG: output_audio_path_for_player: {output_audio_path_for_player}")
407
- print(f"DEBUG: output_path_for_download: {output_path_for_download}")
408
- progress(1, desc="Finished!")
409
  return output_audio_path_for_player, output_path_for_download, final_status
410
 
411
- # --- Gradio Interface Definition ---
412
- with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
413
- gr.Markdown("# 🎵 Gemini Text-to-Speech UI 🗣️")
 
414
  if not HF_GEMINI_API_KEY:
415
  gr.Warning(
416
- "GEMINI_API_KEY not found in Hugging Face Secrets. "
417
- "Please add it in your Space's 'Settings' > 'Secrets' tab for the app to work. "
418
- "Name the secret `GEMINI_API_KEY`."
419
  )
420
  else:
421
- gr.Info("Gemini API Key loaded successfully from Space Secrets. Ready to generate!")
422
 
423
  gr.Markdown(
424
- "Convert text to speech using Google's Gemini API. "
425
- "Your Gemini API Key must be set as a Secret named `GEMINI_API_KEY` in this Space's settings."
426
- "\n\nGet your API Key from [Google AI Studio](https://aistudio.google.com/app/apikey)."
427
  )
428
 
429
  with gr.Row():
430
- with gr.Column(scale=2): # Wider column for text inputs
431
- use_file = gr.Checkbox(label="📁 Use Text File Input (.txt)", value=False)
 
432
  text_file = gr.File(
433
- label="Upload Text File", # Simpler label
434
  file_types=['.txt'],
435
- visible=False # Initially hidden
436
  )
437
  text_to_speak = gr.Textbox(
438
- label="📝 Text to Speak (or use file above)",
439
  lines=10,
440
- placeholder="Enter text here...",
441
- visible=True # Initially visible
 
442
  )
443
  use_file.change(
444
  lambda x: (gr.update(visible=x), gr.update(visible=not x)),
@@ -446,105 +392,111 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary
446
  [text_file, text_to_speak]
447
  )
448
  speech_prompt = gr.Textbox(
449
- label="🗣️ Speech Prompt (Optional)",
450
- placeholder="e.g., 'As an energetic YouTuber speaking to an audience'",
451
- info="Influences style, emotion, and voice characteristics."
 
452
  )
453
 
454
- with gr.Column(scale=1):
455
- model_name = gr.Dropdown(
456
- MODELS, label="🤖 Model", value=MODELS[0]
 
 
 
 
 
457
  )
458
- speaker_voice = gr.Dropdown(
459
- SPEAKER_VOICES, label="🎤 Speaker Voice", value="Charon"
 
 
 
 
460
  )
461
- temperature = gr.Slider(
462
- minimum=0.0, maximum=1.0, step=0.05, value=0.7, # Gemini TTS often uses temp <= 1
463
- label="🌡️ Temperature",
464
- info="Controls randomness (0.0-1.0). Higher for more variation."
465
  )
466
- max_chunk_size = gr.Slider(
467
  minimum=1000, maximum=4000, step=100, value=3800,
468
- label="🧩 Max Characters per Chunk",
469
- info="Text is split for API. Max 4096 per request for some models."
470
  )
471
- sleep_between_requests = gr.Slider(
472
- minimum=1, maximum=15, step=0.5, value=2, # Reduced default sleep
473
- label="⏱️ Sleep Between Chunks (sec)",
474
- info="Helps manage API rate limits (e.g. Gemini Flash has 60 RPM limit)."
475
  )
476
- output_filename_base = gr.Textbox(
477
- label="💾 Output Filename Base", value="gemini_tts_audio"
478
  )
479
 
480
- with gr.Group(visible=PYDUB_AVAILABLE):
481
- merge_audio = gr.Checkbox(label="🔗 Merge Audio Chunks (if >1)", value=True)
482
- delete_partials = gr.Checkbox(label="🗑️ Delete Chunks After Merge", value=True, visible=True) # Default visible
483
- merge_audio.change(lambda x: gr.update(visible=x), [merge_audio], [delete_partials])
484
-
485
- if not PYDUB_AVAILABLE:
486
- gr.Markdown("<small>⚠️ Merging disabled: `pydub` library not found. Install if needed.</small>")
 
 
487
 
488
 
489
- submit_button = gr.Button("✨ Generate Audio ✨", variant="primary", scale=2) # Centered button
490
 
 
491
  with gr.Row():
492
  with gr.Column(scale=1):
493
- output_audio_player = gr.Audio(label="🎧 Generated Audio Output", type="filepath", format="wav") # Specify format if known
494
  with gr.Column(scale=1):
495
- output_file_download = gr.File(label="📥 Download Output File", type="filepath")
496
 
497
- status_textbox = gr.Textbox(label="📊 Status Log", lines=10, interactive=False, max_lines=20)
498
 
499
  submit_button.click(
500
  fn=generate_audio_for_gradio,
501
  inputs=[
502
  use_file, text_file, speech_prompt, text_to_speak,
503
- max_chunk_size, sleep_between_requests, temperature,
504
- model_name, speaker_voice, output_filename_base,
505
- merge_audio, delete_partials # Even if not visible, pass them
506
  ],
507
- outputs=[output_audio_player, output_file_download, status_textbox]
508
  )
509
 
510
  gr.Markdown("---")
511
- # The encoded text part:
512
- encoded_text = "Q3JlYXRlIGJ5IDogYWlnb2xkZW4=" # "Created by : aigolden"
 
513
  try:
514
- decoded_text = base64.b64decode(encoded_text.encode('utf-8')).decode('utf-8')
515
- gr.Markdown(f"<p style='text-align:center; font-size:small;'><em>{decoded_text}</em></p>")
516
- except Exception as e_decode:
517
- print(f"Error decoding/displaying credit: {e_decode}")
518
- pass
519
 
520
  gr.Examples(
 
521
  examples=[
522
- [False, None, "A friendly and informative narrator.", "Hello world, this is a test of the Gemini text to speech API using Gradio. I hope this works well!", 3800, 2, 0.7, MODELS[0], "Charon", "example_hello", True, True],
523
- [False, None, "An excited news reporter.", "Breaking news! Artificial intelligence can now generate human-like speech. This technology is rapidly evolving!", 3000, 2, 0.8, MODELS[1], "Achernar", "example_news", True, True],
524
- [True, "sample_text.txt", "A calm storyteller.", "", 3500, 3, 0.6, MODELS[0], "Vindemiatrix", "example_from_file", True, False]
525
  ],
526
- fn=generate_audio_for_gradio, # Ensure example fn is the same as main
527
- inputs=[ # Ensure these match the function's inputs exactly (order and number)
528
  use_file, text_file, speech_prompt, text_to_speak,
529
- max_chunk_size, sleep_between_requests, temperature,
530
- model_name, speaker_voice, output_filename_base,
531
- merge_audio, delete_partials
532
  ],
533
- outputs=[output_audio_player, output_file_download, status_textbox],
534
- cache_examples=False # API calls, so don't cache results based on static inputs
535
  )
536
- gr.Markdown("<small>To use the 'example_from_file', please create a `sample_text.txt` file in the root of this Space with some text content, or upload your own text file.</small>")
537
-
538
 
 
539
  if __name__ == "__main__":
540
  if not PYDUB_AVAILABLE:
541
- print("WARNING: pydub library is not installed or working. Audio file merging will be disabled.")
542
  if not HF_GEMINI_API_KEY:
543
- print("WARNING: GEMINI_API_KEY environment variable not set. The app might not work in local if it relies on this for API key.")
544
 
545
- # For local testing, you might want to provide a way to input the API key
546
- # or set the GEMINI_API_KEY environment variable before running.
547
- # e.g., export GEMINI_API_KEY="your_key_here"
548
- # then run python app.py
549
-
550
- demo.launch(debug=True, share=False) # share=False for local, HF Spaces handles public link
 
8
  import zipfile
9
  from google import genai
10
  from google.genai import types
11
+ import traceback # برای نمایش خطاهای دقیق‌تر
12
 
13
+ # خواندن کلید API از Hugging Face Secrets
 
14
  HF_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
15
 
16
  try:
 
18
  PYDUB_AVAILABLE = True
19
  except ImportError:
20
  PYDUB_AVAILABLE = False
21
+ print("⚠️ کتابخانه pydub در دسترس نیست. قابلیت ادغام فایل‌های صوتی غیرفعال خواهد بود.")
 
22
 
23
+ # --- ثابت‌ها ---
24
  SPEAKER_VOICES = [
25
  "Achird", "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
26
  "Laomedeia", "Achernar", "Alnilam", "Schedar", "Gacrux", "Pulcherrima",
 
29
  "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda"
30
  ]
31
  MODELS = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
32
# Persian display labels keyed by the model identifiers.
MODEL_NAMES_FARSI = {
    "gemini-2.5-flash-preview-tts": "جمینای ۲.۵ فلش (سریع‌تر، اقتصادی‌تر)",
    "gemini-2.5-pro-preview-tts": "جمینای ۲.۵ پرو (کیفیت بالاتر)",
}

# Persian display labels for a sample of the voices; names can be added for
# every speaker.
SPEAKER_VOICES_FARSI_SAMPLE = {
    "Charon": "شارون (پیش‌فرض)",
    "Achernar": "آخرالنهر",
    "Vindemiatrix": "vindemiatrix (ستاره‌شناس)",
    # ... remaining speakers
}
42
+
43
+
44
+ # --- توابع کمکی (بدون تغییر زیاد در منطق، فقط پیام‌ها فارسی می‌شوند) ---
45
  def save_binary_file(file_name, data):
46
  abs_file_name = os.path.abspath(file_name)
47
  try:
48
  with open(abs_file_name, "wb") as f:
49
  f.write(data)
50
+ print(f"✅ فایل در مسیر ذخیره شد: {abs_file_name}")
51
  return abs_file_name
52
  except Exception as e:
53
+ print(f"❌ خطا در ذخیره فایل {abs_file_name}: {e}")
54
  return None
55
 
56
  def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
57
  parameters = parse_audio_mime_type(mime_type)
58
  bits_per_sample = parameters["bits_per_sample"]
59
  sample_rate = parameters["rate"]
60
+ num_channels = 1
61
  data_size = len(audio_data)
62
  bytes_per_sample = bits_per_sample // 8
63
  block_align = num_channels * bytes_per_sample
64
  byte_rate = sample_rate * block_align
65
+ chunk_size = 36 + data_size
 
66
  header = struct.pack(
67
  "<4sI4s4sIHHIIHH4sI",
68
+ b"RIFF", chunk_size, b"WAVE", b"fmt ", 16, 1, num_channels,
69
+ sample_rate, byte_rate, block_align, bits_per_sample, b"data", data_size
 
 
70
  )
71
  return header + audio_data
72
 
73
  def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
74
+ bits_per_sample = 16
75
+ rate = 24000
76
  if mime_type:
77
  mime_type_lower = mime_type.lower()
78
  parts = mime_type_lower.split(";")
79
  for param in parts:
80
  param = param.strip()
81
  if param.startswith("rate="):
82
+ try: rate = int(param.split("=", 1)[1])
83
+ except: pass
84
+ elif param.startswith("audio/l"):
85
  try:
 
 
 
 
 
 
86
  potential_bits = param.split("l", 1)[1]
87
+ if potential_bits.isdigit(): bits_per_sample = int(potential_bits)
88
+ except: pass
 
89
  return {"bits_per_sample": bits_per_sample, "rate": rate}
90
 
91
  def load_text_from_gr_file(file_obj):
92
  if file_obj is None:
93
+ return "", "فایلی برای ورودی متن ارائه نشده است."
94
  try:
95
  with open(file_obj.name, 'r', encoding='utf-8') as f:
96
  content = f.read().strip()
97
  if not content:
98
+ return "", "فایل متنی خالی است."
99
+ return content, f"متن با موفقیت از فایل '{os.path.basename(file_obj.name)}' ({len(content)} کاراکتر) بارگذاری شد."
100
  except Exception as e:
101
+ return "", f"خطا در خواندن فایل متنی: {e}"
102
 
103
  def smart_text_split(text, max_size=3800):
104
+ if len(text) <= max_size: return [text]
105
+ chunks, current_chunk = [], ""
106
+ sentences = re.split(r'(?<=[.!?؟])\s+', text)
 
 
107
  for sentence in sentences:
108
  if not sentence: continue
109
+ if len(current_chunk) + len(sentence) + 1 > max_size:
110
+ if current_chunk: chunks.append(current_chunk.strip())
 
 
 
 
 
111
  if len(sentence) > max_size:
112
+ words, temp_part = sentence.split(' '), ""
 
113
  for word in words:
114
+ if len(temp_part) + len(word) + 1 > max_size:
115
+ if temp_part: chunks.append(temp_part.strip())
 
116
  if len(word) > max_size:
117
+ for i in range(0, len(word), max_size): chunks.append(word[i:i+max_size])
118
+ temp_part = ""
119
+ else: temp_part = word
120
+ else: temp_part += (" " if temp_part else "") + word
121
+ if temp_part: chunks.append(temp_part.strip())
122
+ current_chunk = ""
123
+ else: current_chunk = sentence
124
+ else: current_chunk += (" " if current_chunk else "") + sentence
125
+ if current_chunk: chunks.append(current_chunk.strip())
 
 
 
 
 
 
 
126
  return chunks
127
 
 
128
  def merge_audio_files_func(file_paths, output_path):
129
+ if not PYDUB_AVAILABLE: return False, "pydub در دسترس نیست. امکان ادغام فایل‌ها وجود ندارد.", None
130
+ if not file_paths: return False, "هیچ فایل صوتی برای ادغام وجود ندارد.", None
 
 
131
  try:
132
  combined = AudioSegment.empty()
133
  for i, file_path in enumerate(file_paths):
134
  if os.path.exists(file_path):
135
  try:
136
+ audio = AudioSegment.from_file(file_path, format="wav") # فرض می‌کنیم ورودی‌ها WAV هستند
 
 
137
  combined += audio
138
+ if i < len(file_paths) - 1: combined += AudioSegment.silent(duration=200)
 
139
  except Exception as e_load:
140
+ msg = f"خطا در بارگذاری فایل صوتی '{os.path.basename(file_path)}' با pydub: {e_load}"
141
+ print(f"⚠️ {msg}")
142
+ return False, msg, None
143
  else:
144
+ msg = f"فایل برای ادغام یافت نشد: {os.path.basename(file_path)}"
145
+ print(f"⚠️ {msg}")
146
+ return False, msg, None
 
147
  abs_output_path = os.path.abspath(output_path)
148
  combined.export(abs_output_path, format="wav")
149
+ return True, f"فایل ادغام شده با موفقیت در '{os.path.basename(abs_output_path)}' ذخیره شد.", abs_output_path
150
  except Exception as e:
151
+ msg = f"خطا در ادغام فایل‌ها: {e}"
152
+ print(f" {msg}")
153
+ return False, msg, None
154
 
155
  def create_zip_file(file_paths, zip_name):
156
  abs_zip_name = os.path.abspath(zip_name)
 
159
  for file_path in file_paths:
160
  if os.path.exists(file_path):
161
  zipf.write(file_path, os.path.basename(file_path))
162
+ return True, f"فایل ZIP با نام '{os.path.basename(abs_zip_name)}' ایجاد شد.", abs_zip_name
163
  except Exception as e:
164
+ return False, f"خطا در ایجاد فایل ZIP: {e}", None
165
 
166
+ # --- تابع اصلی تولید صدا ---
167
  def generate_audio_for_gradio(
 
168
  use_file_input_checkbox, text_file_obj,
169
  speech_prompt_input, text_to_speak_input,
170
  max_chunk_slider, sleep_slider, temperature_slider,
171
+ model_dropdown_key, # کلید مدل (انگلیسی)
172
+ speaker_dropdown, output_filename_base_input,
173
  merge_checkbox, delete_partials_checkbox,
 
174
  progress=gr.Progress(track_tqdm=True)
175
  ):
176
  status_messages = []
177
+ status_messages.append("🚀 فرآیند تبدیل متن به گفتار آغاز شد...")
178
+ progress(0, desc="در حال آماده‌سازی...")
179
 
 
180
  api_key_to_use = HF_GEMINI_API_KEY
181
  if not api_key_to_use:
182
+ status_messages.append("❌ خطا: کلید API جمینای (GEMINI_API_KEY) در تنظیمات Secret این Space یافت نشد.")
183
+ status_messages.append("⬅️ لطفاً آن را در بخش Settings > Secrets مربوط به این Space تنظیم کنید.")
 
 
 
184
  return None, None, "\n".join(status_messages)
185
 
186
+ os.environ["GEMINI_API_KEY"] = api_key_to_use
187
+ status_messages.append("🔑 کلید API با موفقیت از Secrets بارگذاری شد.")
188
 
 
189
  actual_text_input = ""
190
  if use_file_input_checkbox:
191
  if text_file_obj is None:
192
+ status_messages.append("❌ خطا: گزینه 'استفاده از فایل متنی' انتخاب شده، اما هیچ فایلی آپلود نشده است.")
193
  return None, None, "\n".join(status_messages)
194
  actual_text_input, msg = load_text_from_gr_file(text_file_obj)
195
  status_messages.append(msg)
196
+ if not actual_text_input: return None, None, "\n".join(status_messages)
 
197
  else:
198
  actual_text_input = text_to_speak_input
199
+ status_messages.append("⌨️ از متن وارد شده به صورت دستی استفاده می‌شود.")
200
 
201
  if not actual_text_input or actual_text_input.strip() == "":
202
+ status_messages.append("❌ خطا: متن ورودی خالی است.")
203
  return None, None, "\n".join(status_messages)
204
 
 
205
  try:
206
+ status_messages.append("🛠️ در حال مقداردهی اولیه کلاینت جمینای...")
207
+ progress(0.1, desc="اتصال به جمینای...")
208
  client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
209
+ status_messages.append("✅ کلاینت جمینای با موفقیت ایجاد شد.")
210
  except Exception as e:
211
+ status_messages.append(f"❌ خطا در ایجاد کلاینت جمینای: {e}")
212
  return None, None, "\n".join(status_messages)
213
 
 
214
  text_chunks = smart_text_split(actual_text_input, int(max_chunk_slider))
215
+ status_messages.append(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.")
216
+ for i, chunk_text_content in enumerate(text_chunks):
217
+ status_messages.append(f" 📝 قطعه {i+1}: {len(chunk_text_content)} کاراکتر")
218
 
 
219
  generated_audio_files = []
220
  run_id = base64.urlsafe_b64encode(os.urandom(6)).decode()
221
  temp_output_dir = f"temp_audio_{run_id}"
222
  os.makedirs(temp_output_dir, exist_ok=True)
223
+ output_base_name_safe = re.sub(r'[\s\\\/\:\*\?\"\<\>\|\%]+', '_', output_filename_base_input)
224
 
225
  total_chunks = len(text_chunks)
226
  for i, chunk_text_content in enumerate(text_chunks):
227
+ progress_val = 0.1 + (0.7 * (i / total_chunks))
228
+ progress(progress_val, desc=f"در حال تولید قطعه {i+1} از {total_chunks}...")
229
 
230
+ status_messages.append(f"\n🔊 در حال تولید صدا برای قطعه {i+1}/{total_chunks}...")
231
  final_text_for_api = f'"{speech_prompt_input}"\n{chunk_text_content}' if speech_prompt_input.strip() else chunk_text_content
232
 
233
  contents_for_api = [types.Content(role="user", parts=[types.Part.from_text(text=final_text_for_api)])]
 
236
  response_modalities=["audio"],
237
  speech_config=types.SpeechConfig(
238
  voice_config=types.VoiceConfig(
239
+ prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=speaker_dropdown) # گوینده از ورودی
240
  )
241
  )
242
  )
243
  try:
244
  chunk_filename_base = f"{output_base_name_safe}_part_{i+1:03d}"
245
  chunk_filepath_prefix = os.path.join(temp_output_dir, chunk_filename_base)
 
246
  audio_data_received = False
247
  for stream_response_chunk in client.models.generate_content_stream(
248
+ model=model_dropdown_key, contents=contents_for_api, config=generate_content_config,
249
  ):
250
  if (stream_response_chunk.candidates and stream_response_chunk.candidates[0].content and
251
  stream_response_chunk.candidates[0].content.parts and
 
255
  data_buffer = inline_data.data
256
  api_mime_type = inline_data.mime_type
257
  audio_data_received = True
258
+ status_messages.append(f"ℹ️ MIME Type دریافتی از API: {api_mime_type}")
 
259
 
260
+ file_extension = ".wav"
 
261
  if api_mime_type and ("mp3" in api_mime_type.lower() or "mpeg" in api_mime_type.lower()):
262
  file_extension = ".mp3"
263
+ status_messages.append(f"ℹ️ ذخیره با فرمت MP3 بر اساس MIME Type: {api_mime_type}")
 
 
264
  elif api_mime_type and "wav" in api_mime_type.lower() and \
265
  not ("audio/l16" in api_mime_type.lower() or "audio/l24" in api_mime_type.lower()):
266
  file_extension = ".wav"
267
+ status_messages.append(f"ℹ️ ذخیره با فرمت WAV بر اساس MIME Type: {api_mime_type}")
268
+ else:
 
269
  file_extension = ".wav"
270
+ status_messages.append(f"ℹ️ تبدیل به فرمت WAV برای MIME Type: {api_mime_type or 'نامشخص'}")
271
  data_buffer = convert_to_wav(data_buffer, api_mime_type)
272
 
273
+ status_messages.append(f"ℹ️ پسوند فایل نهایی: {file_extension}")
274
 
275
  generated_file_path = save_binary_file(f"{chunk_filepath_prefix}{file_extension}", data_buffer)
276
  if generated_file_path:
277
  generated_audio_files.append(generated_file_path)
278
+ status_messages.append(f"✅ قطعه {i+1} ذخیره شد: {os.path.basename(generated_file_path)}")
279
  else:
280
+ status_messages.append(f"❌ عدم موفقیت در ذخیره قطعه {i+1}.")
281
+ break
 
282
  elif stream_response_chunk.text:
283
+ status_messages.append(f"ℹ️ پیام متنی از API (حین استریم): {stream_response_chunk.text}")
284
 
285
  if not audio_data_received:
286
+ status_messages.append(f"❌ هیچ داده صوتی برای قطعه {i+1} دریافت نشد.")
 
287
  if stream_response_chunk and stream_response_chunk.prompt_feedback and stream_response_chunk.prompt_feedback.block_reason:
288
+ status_messages.append(f"🛑 دلیل مسدود شدن توسط API: {stream_response_chunk.prompt_feedback.block_reason_message or stream_response_chunk.prompt_feedback.block_reason}")
 
 
289
  except types.BlockedPromptException as bpe:
290
+ status_messages.append(f"❌ محتوای قطعه {i+1} توسط API مسدود شد: {bpe}")
291
+ status_messages.append(f" بازخورد API: {bpe.response.prompt_feedback}")
292
  except types.StopCandidateException as sce:
293
+ status_messages.append(f"❌ تولید صدا برای قطعه {i+1} متوقف شد: {sce}")
294
+ status_messages.append(f" بازخورد API: {sce.response.prompt_feedback}")
295
  except Exception as e:
296
+ status_messages.append(f"❌ خطا در تولید/پردازش قطعه {i+1}: {type(e).__name__} - {e}")
297
+ status_messages.append(traceback.format_exc())
298
+ continue
 
299
 
300
+ if i < total_chunks - 1 and float(sleep_slider) > 0 :
301
+ status_messages.append(f"⏱️ انتظار به مدت {sleep_slider} ثانیه...")
302
  time.sleep(float(sleep_slider))
303
 
304
+ progress(0.85, desc="پردازش فایل‌های نهایی...")
 
305
  if not generated_audio_files:
306
+ status_messages.append("❌ هیچ فایل صوتی با موفقیت تولید یا ذخیره نشد!")
307
  final_status = "\n".join(status_messages)
308
  print(final_status)
309
+ progress(1, desc="پایان با خطا.")
310
  return None, None, final_status
311
 
312
+ status_messages.append(f"\n🎉 {len(generated_audio_files)} فایل(های) صوتی تولید شد!")
313
+
314
+ output_audio_path_for_player = None
315
+ output_path_for_download = None
316
+
317
+ if merge_checkbox and len(generated_audio_files) > 1 and PYDUB_AVAILABLE:
318
+ status_messages.append(f"🔗 در حال ادغام {len(generated_audio_files)} فایل صوتی...")
319
+ merged_filename_path = os.path.join(temp_output_dir, f"{output_base_name_safe}_merged.wav")
320
+ success_merge, msg_merge, merged_p = merge_audio_files_func(generated_audio_files, merged_filename_path)
321
+ status_messages.append(msg_merge)
322
+ if success_merge:
323
+ output_audio_path_for_player = merged_p
324
+ output_path_for_download = merged_p
325
+ if delete_partials_checkbox:
326
+ status_messages.append("🗑️ در حال حذف فایل‌های جزئی...")
327
+ for file_p in generated_audio_files:
328
+ try: os.remove(file_p); status_messages.append(f" 🗑️ حذف شد: {os.path.basename(file_p)}")
329
+ except Exception as e_del: status_messages.append(f" ⚠️ عدم موفقیت در حذف {os.path.basename(file_p)}: {e_del}")
330
+ else:
331
+ status_messages.append("⚠️ ادغام ناموفق بود. فایل ZIP از قطعات ارائه می‌شود.")
332
  success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
333
  status_messages.append(msg_zip)
334
  if success_zip: output_path_for_download = zip_p
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  elif len(generated_audio_files) == 1:
 
336
  single_file_path = generated_audio_files[0]
337
+ output_audio_path_for_player = single_file_path
 
 
 
 
 
 
 
 
338
  output_path_for_download = single_file_path
339
+ status_messages.append(f"🎵 فایل صوتی تکی: {os.path.basename(single_file_path)}")
340
+ elif len(generated_audio_files) > 1: # No merge or pydub not available
341
+ if not PYDUB_AVAILABLE and merge_checkbox:
342
+ status_messages.append("⚠️ pydub در دسترس نیست، امکان ادغام وجود ندارد. فایل ZIP ارائه می‌شود.")
343
+ status_messages.append("📦 چندین قطعه تولید شد. در حال ایجاد فایل ZIP...")
344
  success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
345
  status_messages.append(msg_zip)
346
  if success_zip: output_path_for_download = zip_p
347
+
348
  final_status = "\n".join(status_messages)
349
  print(final_status)
350
+ print(f"DEBUG مسیر فایل برای پخش کننده: {output_audio_path_for_player}")
351
+ print(f"DEBUG مسیر فایل برای دانلود: {output_path_for_download}")
352
+ progress(1, desc="انجام شد!")
353
  return output_audio_path_for_player, output_path_for_download, final_status
354
 
355
+ # --- تعریف رابط کاربری Gradio ---
356
+ with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky), title="تبدیل متن به گفتار با Gemini") as demo:
357
+ gr.Markdown("# 🎵 تبدیل متن به گفتار با Gemini API 🗣️", elem_id="app-title")
358
+
359
  if not HF_GEMINI_API_KEY:
360
  gr.Warning(
361
+ "کلید API جمینای (GEMINI_API_KEY) در Hugging Face Secrets یافت نشد. "
362
+ "لطفاً آن را در بخش 'Settings' > 'Secrets' این Space با نام `GEMINI_API_KEY` اضافه کنید تا برنامه کار کند."
 
363
  )
364
  else:
365
+ gr.Info("کلید API جمینای با موفقیت از Secrets بارگذاری شد. آماده تولید صدا!")
366
 
367
  gr.Markdown(
368
+ "این ابزار متن شما را با استفاده از API قدرتمند Gemini گوگل به گفتار تبدیل می‌کند. "
369
+ "برای استفاده، باید کلید API جمینای خود را در بخش Secrets این Space تنظیم کرده باشید."
370
+ "\n\nمی‌توانید کلید API خود را از [استودیوی هوش مصنوعی گوگل (Google AI Studio)](https://aistudio.google.com/app/apikey) دریافت کنید."
371
  )
372
 
373
  with gr.Row():
374
+ with gr.Column(scale=3): # ستون ورودی متن
375
+ gr.Markdown("### ۱. متن ورودی")
376
+ use_file = gr.Checkbox(label="📁 استفاده از فایل متنی (.txt) به جای ورود دستی", value=False)
377
  text_file = gr.File(
378
+ label="آپلود فایل متنی",
379
  file_types=['.txt'],
380
+ visible=False
381
  )
382
  text_to_speak = gr.Textbox(
383
+ label="📝 متنی که می‌خواهید به گفتار تبدیل شود:",
384
  lines=10,
385
+ placeholder="متن خود را در اینجا وارد کنید یا فایل متنی را در بالا آپلود نمایید...",
386
+ visible=True,
387
+ text_align="right" # برای متون فارسی
388
  )
389
  use_file.change(
390
  lambda x: (gr.update(visible=x), gr.update(visible=not x)),
 
392
  [text_file, text_to_speak]
393
  )
394
  speech_prompt = gr.Textbox(
395
+ label="🗣️ فرمان سبک گفتار (اختیاری)",
396
+ placeholder="مثال: «با لحنی دوستانه و پرانرژی، مانند یک مجری پادکست صحبت کن»",
397
+ info="این فرمان به تنظیم سبک، احساسات و ویژگی‌های صدای خروجی کمک می‌کند.",
398
+ text_align="right"
399
  )
400
 
401
+ with gr.Column(scale=2): # ستون تنظیمات
402
+ gr.Markdown("### ۲. تنظیمات تولید صدا")
403
+ # تبدیل دیکشنری نام‌های فارسی به لیست تاپل‌ها برای Dropdown
404
+ model_choices_farsi = [(MODEL_NAMES_FARSI[key], key) for key in MODELS]
405
+ model_name_dropdown = gr.Dropdown(
406
+ choices=model_choices_farsi, # نمایش نام فارسی، ارسال کلید انگلیسی
407
+ label="🤖 انتخاب مدل Gemini",
408
+ value=MODELS[0] # مقدار پیش‌فرض کلید انگلیسی
409
  )
410
+
411
+ speaker_choices_farsi = [(SPEAKER_VOICES_FARSI_SAMPLE.get(v, v), v) for v in SPEAKER_VOICES]
412
+ speaker_voice_dropdown = gr.Dropdown(
413
+ choices=speaker_choices_farsi,
414
+ label="🎤 انتخاب گوینده",
415
+ value="Charon"
416
  )
417
+ temperature_slider = gr.Slider(
418
+ minimum=0.0, maximum=1.0, step=0.05, value=0.7,
419
+ label="🌡️ دمای مدل (Temperature)",
420
+ info="میزان خلاقیت و تنوع در خروجی (0.0 تا 1.0). مقادیر بالاتر = تنوع بیشتر."
421
  )
422
+ max_chunk_size_slider = gr.Slider(
423
  minimum=1000, maximum=4000, step=100, value=3800,
424
+ label="🧩 حداکثر کاراکتر در هر قطعه",
425
+ info="متن برای ارسال به API به قطعات کوچکتر تقسیم می‌شود."
426
  )
427
+ sleep_between_requests_slider = gr.Slider(
428
+ minimum=0, maximum=15, step=0.5, value=1, # کاهش مقدار پیش‌فرض برای سرعت بیشتر
429
+ label="⏱️ تاخیر بین درخواست‌ها (ثانیه)",
430
+ info="برای مدیریت محدودیت‌های API (مثلاً Gemini Flash دارای محدودیت ۶۰ درخواست در دقیقه است)."
431
  )
432
+ output_filename_base_input = gr.Textbox(
433
+ label="💾 نام پایه فایل خروجی", value="gemini_tts_farsi"
434
  )
435
 
436
+ with gr.Group(elem_id="merge-options"):
437
+ gr.Markdown("تنظیمات ادغام (در صورت تولید بیش از یک قطعه):")
438
+ merge_audio_checkbox = gr.Checkbox(label="🔗 ادغام قطعات صوتی", value=True, visible=PYDUB_AVAILABLE)
439
+ delete_partials_checkbox = gr.Checkbox(label="🗑️ حذف قطعات پس از ادغام", value=True, visible=PYDUB_AVAILABLE and True) # نمایش اگر ادغام فعال و pydub موجود باشد
440
+
441
+ if PYDUB_AVAILABLE:
442
+ merge_audio_checkbox.change(lambda x: gr.update(visible=x), [merge_audio_checkbox], [delete_partials_checkbox])
443
+ else:
444
+ gr.Markdown("<p style='color:orange; font-size:small;'>⚠️ قابلیت ادغام فایل‌ها به دلیل عدم دسترسی به کتابخانه `pydub` غیرفعال است.</p>")
445
 
446
 
447
+ submit_button = gr.Button("✨ تولید فایل صوتی ✨", variant="primary", elem_id="submit-button-main")
448
 
449
+ gr.Markdown("### ۳. خروجی")
450
  with gr.Row():
451
  with gr.Column(scale=1):
452
+ output_audio_player_component = gr.Audio(label="🎧 فایل صوتی تولید شده (قابل پخش)", type="filepath")
453
  with gr.Column(scale=1):
454
+ output_file_download_component = gr.File(label="📥 دانلود فایل خروجی (صوتی یا ZIP)", type="filepath")
455
 
456
+ status_textbox_component = gr.Textbox(label="📊 گزارش وضعیت و پیام‌ها", lines=10, interactive=False, max_lines=20, text_align="right")
457
 
458
  submit_button.click(
459
  fn=generate_audio_for_gradio,
460
  inputs=[
461
  use_file, text_file, speech_prompt, text_to_speak,
462
+ max_chunk_size_slider, sleep_between_requests_slider, temperature_slider,
463
+ model_name_dropdown, speaker_voice_dropdown, output_filename_base_input,
464
+ merge_audio_checkbox, delete_partials_checkbox
465
  ],
466
+ outputs=[output_audio_player_component, output_file_download_component, status_textbox_component]
467
  )
468
 
469
  gr.Markdown("---")
470
+ # اطلاعات سازنده
471
+ encoded_text_creator = "Q3JlYXRlZCBieSA6IEhhbWVkNzQ0IChBSUdPTERFTikgZm9yIEh1Z2dpbmcgRmFjZSBTcGFjZXMu"
472
+ # "Created by : Hamed744 (AIGOLDEN) for Hugging Face Spaces."
473
  try:
474
+ decoded_text_creator = base64.b64decode(encoded_text_creator.encode('utf-8')).decode('utf-8')
475
+ gr.Markdown(f"<p style='text-align:center; font-size:small; color:grey;'><em>{decoded_text_creator}</em></p>")
476
+ except Exception: pass
 
 
477
 
478
  gr.Examples(
479
+ label="چند مثال برای شروع:",
480
  examples=[
481
+ [False, None, "یک راوی مهربان و آموزنده.", "سلام دنیا! این یک آزمایش برای تبدیل متن به گفتار با جمینای در گریدیا است. امیدوارم خوب کار کند!", 3800, 1, 0.7, MODELS[0], "Charon", "hello_farsi", True, True],
482
+ [False, None, "یک گزارشگر خبری هیجان‌زده.", "خبر فوری! هوش مصنوعی اکنون می‌تواند گفتاری شبیه به انسان با وضوح باورنکردنی تولید کند. این فناوری به سرعت در حال پیشرفت است!", 3000, 1, 0.8, MODELS[1], "Achernar", "news_farsi", True, True],
 
483
  ],
484
+ fn=generate_audio_for_gradio,
485
+ inputs=[
486
  use_file, text_file, speech_prompt, text_to_speak,
487
+ max_chunk_size_slider, sleep_between_requests_slider, temperature_slider,
488
+ model_name_dropdown, speaker_voice_dropdown, output_filename_base_input,
489
+ merge_audio_checkbox, delete_partials_checkbox
490
  ],
491
+ outputs=[output_audio_player_component, output_file_download_component, status_textbox_component],
492
+ cache_examples=False # چون با API کار می‌کند و ورودی‌ها داینامیک هستند
493
  )
 
 
494
 
495
+ # اجرای برنامه در صورت اجرای مستقیم فایل (برای تست محلی)
496
  if __name__ == "__main__":
497
  if not PYDUB_AVAILABLE:
498
+ print("هشدار: کتابخانه pydub نصب نشده یا کار نمی‌کند. قابلیت ادغام فایل‌های صوتی غیرفعال خواهد بود.")
499
  if not HF_GEMINI_API_KEY:
500
+ print("هشدار: متغیر محیطی GEMINI_API_KEY تنظیم نشده است. اگر برنامه برای کلید API به آن متکی باشد، ممکن است در حالت محلی کار نکند.")
501
 
502
+ demo.launch(debug=True, share=False) # share=False برای اجرای محلی، هاگینگ فیس لینک عمومی را مدیریت می‌کند