Hamed744 committed on
Commit
e2b4736
·
verified ·
1 Parent(s): b914b1f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +30 -28
app.py CHANGED
@@ -8,7 +8,7 @@ import time
8
  import zipfile
9
  import traceback # For detailed error logging if needed
10
  from google import genai
11
- from google.genai import types as genai_types # Renamed to avoid conflict with built-in types
12
 
13
  try:
14
  from pydub import AudioSegment
@@ -40,10 +40,10 @@ def save_binary_file(file_name, data, log_func, current_logs):
40
  full_path = os.path.join(OUTPUT_DIR, file_name)
41
  try:
42
  with open(full_path, "wb") as f: f.write(data)
43
- current_logs = log_func(f"✅ فایل در مسیر زیر ذخیره شد: {full_path}", current_logs)
44
  return full_path, current_logs
45
  except Exception as e:
46
- current_logs = log_func(f"❌ خطا در ذخیره فایل {file_name}: {e}", current_logs)
47
  return None, current_logs
48
 
49
  def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
@@ -70,14 +70,14 @@ def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
70
  return {"bits_per_sample": bits_per_sample, "rate": rate}
71
 
72
  def load_text_from_file(file_obj, log_func, current_logs):
73
- if file_obj is None: return "", log_func("❌ هیچ فایلی آپلود نشد.", current_logs)
74
  file_path = file_obj.name
75
- current_logs = log_func(f"✅ فایل '{os.path.basename(file_path)}' دریافت شد.", current_logs)
76
  try:
77
  with open(file_path, 'r', encoding='utf-8') as f: content = f.read().strip()
78
  current_logs = log_func(f"📖 متن: {len(content)} کاراکتر. نمونه: '{content[:100]}{'...' if len(content) > 100 else ''}'", current_logs)
79
  return content, current_logs
80
- except Exception as e: return "", log_func(f"❌ خطا در خواندن فایل: {e}", current_logs)
81
 
82
  def smart_text_split(text, max_size=3800):
83
  if len(text) <= max_size: return [text]
@@ -168,16 +168,22 @@ def generate_audio_from_text_gradio(
168
 
169
  text_chunks = smart_text_split(text_input_content, max_chunk_size_ui)
170
  logs = log_message(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.", logs)
171
- for i, chunk_text in enumerate(text_chunks): # Renamed chunk to chunk_text
172
  logs = log_message(f"📝 قطعه {i+1}: {len(chunk_text)} کاراکتر", logs)
173
 
174
  generated_files = []
175
- for i, chunk_text_for_api in enumerate(text_chunks): # Use the chunk_text directly
176
  logs = log_message(f"\n🔊 تولید صدا قطعه {i+1}/{len(text_chunks)}...", logs)
177
 
178
- # IMPORTANT CHANGE: final_text_for_api is now just the chunk
179
- # The speech_prompt_ui is NOT added to the text for these models/API calls
180
- final_text_for_api = chunk_text_for_api
 
 
 
 
 
 
181
 
182
  api_contents = [
183
  genai_types.Content(
@@ -219,8 +225,9 @@ def generate_audio_from_text_gradio(
219
  mime_type_from_api = inline_data.mime_type
220
  elif chunk_response.text:
221
  log_text = f"💬 پیام API قطعه {i+1}: {chunk_response.text}"
222
- if "error" in chunk_response.text.lower() or "failed" in chunk_response.text.lower():
223
- logs = log_message(f" {log_text}", logs)
 
224
  else:
225
  logs = log_message(f"ℹ️ {log_text}", logs)
226
 
@@ -246,11 +253,15 @@ def generate_audio_from_text_gradio(
246
  generated_files.append(saved_file_path)
247
  logs = log_message(f"✅ قطعه {i+1} تولید شد: {saved_file_path}", logs)
248
  else:
249
- if not f"❌ پیام API قطعه {i+1}" in logs: # Avoid duplicate error if API already sent one
250
  logs = log_message(f"❌ قطعه {i+1} بدون داده صوتی.", logs)
251
 
252
  except Exception as e:
253
  error_msg = f"❌ خطا تولید قطعه {i+1}: {type(e).__name__} - {e}"
 
 
 
 
254
  if "API_KEY_INVALID" in str(e): error_msg += "\n🔑 کلید API نامعتبر."
255
  elif "permission" in str(e).lower() or "403" in str(e): error_msg += f"\n🚫 عدم دسترسی به {model_name_ui}."
256
  elif "429" in str(e) or "quota" in str(e).lower(): error_msg += f"\n🐢 محدودیت Quota."
@@ -301,7 +312,7 @@ def generate_audio_from_text_gradio(
301
  return log_message("🛑 خروجی صوتی نیست.", logs), None, None, gr.update(visible=False)
302
  return logs, final_audio_path, zip_file_path, gr.update(visible=zip_visible)
303
 
304
- # --- Gradio UI (Largely unchanged, ensure default values are correct) ---
305
  css = """
306
  body { direction: rtl; }
307
  .rtl_override { direction: rtl !important; text-align: right !important; }
@@ -311,34 +322,30 @@ footer { display: none !important; }
311
  .gradio-container { max-width: 800px !important; margin: auto !important; }
312
  """
313
  API_KEY_FROM_ENV = os.environ.get("GEMINI_API_KEY")
314
-
315
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
316
  gr.Markdown(
317
  """
318
  <div style='text-align: center; font-family: "Arial", sans-serif;'>
319
  <h1 class='rtl_override'>تبدیل متن به گفتار با Gemini API</h1>
320
- <p class='rtl_override'>توجه: قابلیت "پرامپت سبک گفتار" فعلا برای این مدل‌ها به طور کامل پشتیبانی نمی‌شود.</p>
321
  </div>
322
  """
323
- ) # Added a note about speech prompt
324
  api_key_status_text = "⚠️ کلید API جمینای (GEMINI_API_KEY) در Secrets این اسپیس تنظیم نشده است."
325
  if API_KEY_FROM_ENV: api_key_status_text = "✅ کلید API جمینای از Secrets بارگذاری شد."
326
  gr.Markdown(f"<p style='text-align:center; color: {'green' if API_KEY_FROM_ENV else 'red'};' class='rtl_override'>{api_key_status_text}</p>")
327
-
328
  with gr.Row():
329
  with gr.Column(scale=2):
330
  gr.Markdown("<h3 class='rtl_override'>تنظیمات ورودی</h3>", elem_classes="rtl_override")
331
  input_method_radio = gr.Radio(["ورودی متنی", "آپلود فایل"], label="روش ورودی", value="ورودی متنی", elem_classes="rtl_override")
332
  text_to_speak_area = gr.Textbox(label="متن مورد نظر", placeholder="متن خود را اینجا وارد کنید...", lines=5, visible=True, elem_classes="rtl_override")
333
  uploaded_file_input = gr.File(label="فایل متنی (.txt)", file_types=[".txt"], visible=False, elem_classes="rtl_override") # type: ignore
334
- speech_prompt_area = gr.Textbox(label="پرامپت سبک گفتار (اختیاری - فعلا تاثیر محدود)", placeholder="مثال: شاد و پر انرژی", lines=2, elem_classes="rtl_override") # Clarified limited effect
335
-
336
  gr.Markdown("<h3 class='rtl_override'>تنظیمات مدل و خروجی</h3>", elem_classes="rtl_override")
337
  model_name_dropdown = gr.Dropdown(MODELS_LIST, label="مدل", value=MODELS_LIST[0], elem_classes="rtl_override")
338
- speaker_voice_dropdown = gr.Dropdown(SPEAKER_VOICES_LIST, label="گوینده", value="Charon", elem_classes="rtl_override") # Default Charon
339
  temperature_slider = gr.Slider(minimum=0, maximum=2, step=0.05, value=1.0, label="دما", elem_classes="rtl_override")
340
  output_filename_base_input = gr.Textbox(value="gemini_tts_output", label="نام پایه فایل خروجی", elem_classes="rtl_override")
341
-
342
  with gr.Column(scale=1):
343
  gr.Markdown("<h3 class='rtl_override'>تنظیمات پیشرفته</h3>", elem_classes="rtl_override")
344
  max_chunk_size_slider = gr.Slider(minimum=2000, maximum=4000, step=100, value=3800, label="حداکثر کاراکتر در قطعه", elem_classes="rtl_override")
@@ -347,24 +354,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"),
347
  pydub_warn_lbl = " (pydub نیست!)" if not PYDUB_AVAILABLE else ""
348
  del_partial_lbl = f"حذف فایل‌های جزئی{pydub_warn_lbl}"
349
  delete_partial_files_checkbox = gr.Checkbox(value=False, label=del_partial_lbl, interactive=PYDUB_AVAILABLE, elem_classes="rtl_override")
350
-
351
  submit_button = gr.Button("🎤 تولید صدا", variant="primary", elem_id="submit_button_custom")
352
  gr.Markdown("<h3 class='rtl_override'>خروجی</h3>", elem_classes="rtl_override")
353
  status_output_area = gr.Textbox(label="پیام‌های وضعیت", lines=10, interactive=False, elem_classes="rtl_override")
354
-
355
  with gr.Row():
356
  audio_player_output = gr.Audio(label="فایل صوتی نهایی/اولین قطعه", type="filepath", elem_classes="rtl_override") # type: ignore
357
  zip_file_output = gr.File(label="دانلود همه قطعات (ZIP)", type="filepath", visible=False, elem_classes="rtl_override") # type: ignore
358
-
359
  def toggle_input_method_visibility(method): return (gr.update(visible=True), gr.update(visible=False)) if method == "ورودی متنی" else (gr.update(visible=False), gr.update(visible=True))
360
  input_method_radio.change(fn=toggle_input_method_visibility, inputs=input_method_radio, outputs=[text_to_speak_area, uploaded_file_input])
361
  def update_delete_partials_interactive(merge_checked): return gr.update(interactive=merge_checked and PYDUB_AVAILABLE)
362
  merge_audio_files_checkbox.change(fn=update_delete_partials_interactive, inputs=merge_audio_files_checkbox, outputs=delete_partial_files_checkbox)
363
-
364
  def trigger_generation_with_api_key(*args_from_ui):
365
  hf_secret_key = os.environ.get("GEMINI_API_KEY")
366
  return generate_audio_from_text_gradio(hf_secret_key, *args_from_ui)
367
-
368
  submit_inputs = [input_method_radio, text_to_speak_area, uploaded_file_input, speech_prompt_area, model_name_dropdown, speaker_voice_dropdown, temperature_slider, max_chunk_size_slider, sleep_between_requests_slider, output_filename_base_input, merge_audio_files_checkbox, delete_partial_files_checkbox]
369
  submit_outputs = [status_output_area, audio_player_output, zip_file_output, zip_file_output]
370
  submit_button.click(fn=trigger_generation_with_api_key, inputs=submit_inputs, outputs=submit_outputs)
 
8
  import zipfile
9
  import traceback # For detailed error logging if needed
10
  from google import genai
11
+ from google.genai import types as genai_types
12
 
13
  try:
14
  from pydub import AudioSegment
 
40
  full_path = os.path.join(OUTPUT_DIR, file_name)
41
  try:
42
  with open(full_path, "wb") as f: f.write(data)
43
+ current_logs = log_func(f"✅ فایل: {full_path}", current_logs)
44
  return full_path, current_logs
45
  except Exception as e:
46
+ current_logs = log_func(f"❌ خطا ذخیره {file_name}: {e}", current_logs)
47
  return None, current_logs
48
 
49
  def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
 
70
  return {"bits_per_sample": bits_per_sample, "rate": rate}
71
 
72
  def load_text_from_file(file_obj, log_func, current_logs):
73
+ if file_obj is None: return "", log_func("❌ فایل آپلود نشد.", current_logs)
74
  file_path = file_obj.name
75
+ current_logs = log_func(f"✅ فایل '{os.path.basename(file_path)}' دریافت.", current_logs)
76
  try:
77
  with open(file_path, 'r', encoding='utf-8') as f: content = f.read().strip()
78
  current_logs = log_func(f"📖 متن: {len(content)} کاراکتر. نمونه: '{content[:100]}{'...' if len(content) > 100 else ''}'", current_logs)
79
  return content, current_logs
80
+ except Exception as e: return "", log_func(f"❌ خطا خواندن فایل: {e}", current_logs)
81
 
82
  def smart_text_split(text, max_size=3800):
83
  if len(text) <= max_size: return [text]
 
168
 
169
  text_chunks = smart_text_split(text_input_content, max_chunk_size_ui)
170
  logs = log_message(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.", logs)
171
+ for i, chunk_text in enumerate(text_chunks):
172
  logs = log_message(f"📝 قطعه {i+1}: {len(chunk_text)} کاراکتر", logs)
173
 
174
  generated_files = []
175
+ for i, chunk_for_api in enumerate(text_chunks):
176
  logs = log_message(f"\n🔊 تولید صدا قطعه {i+1}/{len(text_chunks)}...", logs)
177
 
178
+ # REVERTING to adding speech_prompt to the text, as per Colab's presumed successful logic
179
+ # Using a simple concatenation. The Colab might have had a more specific format.
180
+ # If speech_prompt_ui is "شاد و پر انرژی" and chunk_for_api is "سلام دنیا"
181
+ # final_text_for_api will be "شاد و پر انرژی\nسلام دنیا"
182
+ if speech_prompt_ui and speech_prompt_ui.strip():
183
+ final_text_for_api = f"{speech_prompt_ui.strip()}\n{chunk_for_api}"
184
+ logs = log_message(f"ℹ️ پرامپت سبک '{speech_prompt_ui.strip()}' به متن اضافه شد.", logs)
185
+ else:
186
+ final_text_for_api = chunk_for_api
187
 
188
  api_contents = [
189
  genai_types.Content(
 
225
  mime_type_from_api = inline_data.mime_type
226
  elif chunk_response.text:
227
  log_text = f"💬 پیام API قطعه {i+1}: {chunk_response.text}"
228
+ # Check if it's an error that might indicate the prompt was misunderstood or caused an issue
229
+ if "error" in chunk_response.text.lower() or "failed" in chunk_response.text.lower() or "invalid input" in chunk_response.text.lower():
230
+ logs = log_message(f"❌ {log_text} (ممکن است به دلیل پرامپت سبک باشد)", logs)
231
  else:
232
  logs = log_message(f"ℹ️ {log_text}", logs)
233
 
 
253
  generated_files.append(saved_file_path)
254
  logs = log_message(f"✅ قطعه {i+1} تولید شد: {saved_file_path}", logs)
255
  else:
256
+ if not f"❌ پیام API قطعه {i+1}" in logs:
257
  logs = log_message(f"❌ قطعه {i+1} بدون داده صوتی.", logs)
258
 
259
  except Exception as e:
260
  error_msg = f"❌ خطا تولید قطعه {i+1}: {type(e).__name__} - {e}"
261
+ # Check if the error message from API (if any in e.args) mentions input format or similar
262
+ if hasattr(e, 'args') and e.args and isinstance(e.args[0], str) and ("input" in e.args[0].lower() or "parse" in e.args[0].lower()):
263
+ error_msg += "\n (ممکن است خطا به دلیل فرمت پرامپت سبک الحاق شده به متن باشد)"
264
+
265
  if "API_KEY_INVALID" in str(e): error_msg += "\n🔑 کلید API نامعتبر."
266
  elif "permission" in str(e).lower() or "403" in str(e): error_msg += f"\n🚫 عدم دسترسی به {model_name_ui}."
267
  elif "429" in str(e) or "quota" in str(e).lower(): error_msg += f"\n🐢 محدودیت Quota."
 
312
  return log_message("🛑 خروجی صوتی نیست.", logs), None, None, gr.update(visible=False)
313
  return logs, final_audio_path, zip_file_path, gr.update(visible=zip_visible)
314
 
315
+ # --- Gradio UI (unchanged) ---
316
  css = """
317
  body { direction: rtl; }
318
  .rtl_override { direction: rtl !important; text-align: right !important; }
 
322
  .gradio-container { max-width: 800px !important; margin: auto !important; }
323
  """
324
  API_KEY_FROM_ENV = os.environ.get("GEMINI_API_KEY")
 
325
  with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
326
  gr.Markdown(
327
  """
328
  <div style='text-align: center; font-family: "Arial", sans-serif;'>
329
  <h1 class='rtl_override'>تبدیل متن به گفتار با Gemini API</h1>
330
+ <p class='rtl_override'>توجه: تاثیر "پرامپت سبک گفتار" به نحوه تفسیر مدل بستگی دارد.</p>
331
  </div>
332
  """
333
+ )
334
  api_key_status_text = "⚠️ کلید API جمینای (GEMINI_API_KEY) در Secrets این اسپیس تنظیم نشده است."
335
  if API_KEY_FROM_ENV: api_key_status_text = "✅ کلید API جمینای از Secrets بارگذاری شد."
336
  gr.Markdown(f"<p style='text-align:center; color: {'green' if API_KEY_FROM_ENV else 'red'};' class='rtl_override'>{api_key_status_text}</p>")
 
337
  with gr.Row():
338
  with gr.Column(scale=2):
339
  gr.Markdown("<h3 class='rtl_override'>تنظیمات ورودی</h3>", elem_classes="rtl_override")
340
  input_method_radio = gr.Radio(["ورودی متنی", "آپلود فایل"], label="روش ورودی", value="ورودی متنی", elem_classes="rtl_override")
341
  text_to_speak_area = gr.Textbox(label="متن مورد نظر", placeholder="متن خود را اینجا وارد کنید...", lines=5, visible=True, elem_classes="rtl_override")
342
  uploaded_file_input = gr.File(label="فایل متنی (.txt)", file_types=[".txt"], visible=False, elem_classes="rtl_override") # type: ignore
343
+ speech_prompt_area = gr.Textbox(label="پرامپت سبک گفتار (اختیاری)", placeholder="مثال: شاد و پر انرژی", lines=2, elem_classes="rtl_override")
 
344
  gr.Markdown("<h3 class='rtl_override'>تنظیمات مدل و خروجی</h3>", elem_classes="rtl_override")
345
  model_name_dropdown = gr.Dropdown(MODELS_LIST, label="مدل", value=MODELS_LIST[0], elem_classes="rtl_override")
346
+ speaker_voice_dropdown = gr.Dropdown(SPEAKER_VOICES_LIST, label="گوینده", value="Charon", elem_classes="rtl_override")
347
  temperature_slider = gr.Slider(minimum=0, maximum=2, step=0.05, value=1.0, label="دما", elem_classes="rtl_override")
348
  output_filename_base_input = gr.Textbox(value="gemini_tts_output", label="نام پایه فایل خروجی", elem_classes="rtl_override")
 
349
  with gr.Column(scale=1):
350
  gr.Markdown("<h3 class='rtl_override'>تنظیمات پیشرفته</h3>", elem_classes="rtl_override")
351
  max_chunk_size_slider = gr.Slider(minimum=2000, maximum=4000, step=100, value=3800, label="حداکثر کاراکتر در قطعه", elem_classes="rtl_override")
 
354
  pydub_warn_lbl = " (pydub نیست!)" if not PYDUB_AVAILABLE else ""
355
  del_partial_lbl = f"حذف فایل‌های جزئی{pydub_warn_lbl}"
356
  delete_partial_files_checkbox = gr.Checkbox(value=False, label=del_partial_lbl, interactive=PYDUB_AVAILABLE, elem_classes="rtl_override")
 
357
  submit_button = gr.Button("🎤 تولید صدا", variant="primary", elem_id="submit_button_custom")
358
  gr.Markdown("<h3 class='rtl_override'>خروجی</h3>", elem_classes="rtl_override")
359
  status_output_area = gr.Textbox(label="پیام‌های وضعیت", lines=10, interactive=False, elem_classes="rtl_override")
 
360
  with gr.Row():
361
  audio_player_output = gr.Audio(label="فایل صوتی نهایی/اولین قطعه", type="filepath", elem_classes="rtl_override") # type: ignore
362
  zip_file_output = gr.File(label="دانلود همه قطعات (ZIP)", type="filepath", visible=False, elem_classes="rtl_override") # type: ignore
 
363
  def toggle_input_method_visibility(method): return (gr.update(visible=True), gr.update(visible=False)) if method == "ورودی متنی" else (gr.update(visible=False), gr.update(visible=True))
364
  input_method_radio.change(fn=toggle_input_method_visibility, inputs=input_method_radio, outputs=[text_to_speak_area, uploaded_file_input])
365
  def update_delete_partials_interactive(merge_checked): return gr.update(interactive=merge_checked and PYDUB_AVAILABLE)
366
  merge_audio_files_checkbox.change(fn=update_delete_partials_interactive, inputs=merge_audio_files_checkbox, outputs=delete_partial_files_checkbox)
 
367
  def trigger_generation_with_api_key(*args_from_ui):
368
  hf_secret_key = os.environ.get("GEMINI_API_KEY")
369
  return generate_audio_from_text_gradio(hf_secret_key, *args_from_ui)
 
370
  submit_inputs = [input_method_radio, text_to_speak_area, uploaded_file_input, speech_prompt_area, model_name_dropdown, speaker_voice_dropdown, temperature_slider, max_chunk_size_slider, sleep_between_requests_slider, output_filename_base_input, merge_audio_files_checkbox, delete_partial_files_checkbox]
371
  submit_outputs = [status_output_area, audio_player_output, zip_file_output, zip_file_output]
372
  submit_button.click(fn=trigger_generation_with_api_key, inputs=submit_inputs, outputs=submit_outputs)