Spaces:

Hamed744
/

Ttspro

Running

App Files Files Community

Hamed744 committed on Jun 5

Commit

e2b4736

verified ·

1 Parent(s): b914b1f

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -28

app.py CHANGED Viewed

@@ -8,7 +8,7 @@ import time
 import zipfile
 import traceback # For detailed error logging if needed
 from google import genai
-from google.genai import types as genai_types # Renamed to avoid conflict with built-in types
 try:
     from pydub import AudioSegment
@@ -40,10 +40,10 @@ def save_binary_file(file_name, data, log_func, current_logs):
     full_path = os.path.join(OUTPUT_DIR, file_name)
     try:
         with open(full_path, "wb") as f: f.write(data)
-        current_logs = log_func(f"✅ فایل در مسیر زیر ذخیره شد: {full_path}", current_logs)
         return full_path, current_logs
     except Exception as e:
-        current_logs = log_func(f"❌ خطا در ذخیره فایل {file_name}: {e}", current_logs)
         return None, current_logs
 def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
@@ -70,14 +70,14 @@ def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
     return {"bits_per_sample": bits_per_sample, "rate": rate}
 def load_text_from_file(file_obj, log_func, current_logs):
-    if file_obj is None: return "", log_func("❌ هیچ فایلی آپلود نشد.", current_logs)
     file_path = file_obj.name
-    current_logs = log_func(f"✅ فایل '{os.path.basename(file_path)}' دریافت شد.", current_logs)
     try:
         with open(file_path, 'r', encoding='utf-8') as f: content = f.read().strip()
         current_logs = log_func(f"📖 متن: {len(content)} کاراکتر. نمونه: '{content[:100]}{'...' if len(content) > 100 else ''}'", current_logs)
         return content, current_logs
-    except Exception as e: return "", log_func(f"❌ خطا در خواندن فایل: {e}", current_logs)
 def smart_text_split(text, max_size=3800):
     if len(text) <= max_size: return [text]
@@ -168,16 +168,22 @@ def generate_audio_from_text_gradio(
     text_chunks = smart_text_split(text_input_content, max_chunk_size_ui)
     logs = log_message(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.", logs)
-    for i, chunk_text in enumerate(text_chunks): # Renamed chunk to chunk_text
         logs = log_message(f"📝 قطعه {i+1}: {len(chunk_text)} کاراکتر", logs)
     generated_files = []
-    for i, chunk_text_for_api in enumerate(text_chunks): # Use the chunk_text directly
         logs = log_message(f"\n🔊 تولید صدا قطعه {i+1}/{len(text_chunks)}...", logs)
-        # IMPORTANT CHANGE: final_text_for_api is now just the chunk
-        # The speech_prompt_ui is NOT added to the text for these models/API calls
-        final_text_for_api = chunk_text_for_api
         api_contents = [
             genai_types.Content(
@@ -219,8 +225,9 @@ def generate_audio_from_text_gradio(
                     mime_type_from_api = inline_data.mime_type
                 elif chunk_response.text:
                     log_text = f"💬 پیام API قطعه {i+1}: {chunk_response.text}"
-                    if "error" in chunk_response.text.lower() or "failed" in chunk_response.text.lower():
-                        logs = log_message(f"❌ {log_text}", logs)
                     else:
                         logs = log_message(f"ℹ️ {log_text}", logs)
@@ -246,11 +253,15 @@ def generate_audio_from_text_gradio(
                     generated_files.append(saved_file_path)
                     logs = log_message(f"✅ قطعه {i+1} تولید شد: {saved_file_path}", logs)
             else:
-                if not f"❌ پیام API قطعه {i+1}" in logs: # Avoid duplicate error if API already sent one
                     logs = log_message(f"❌ قطعه {i+1} بدون داده صوتی.", logs)
         except Exception as e:
             error_msg = f"❌ خطا تولید قطعه {i+1}: {type(e).__name__} - {e}"
             if "API_KEY_INVALID" in str(e): error_msg += "\n🔑 کلید API نامعتبر."
             elif "permission" in str(e).lower() or "403" in str(e): error_msg += f"\n🚫 عدم دسترسی به {model_name_ui}."
             elif "429" in str(e) or "quota" in str(e).lower(): error_msg += f"\n🐢 محدودیت Quota."
@@ -301,7 +312,7 @@ def generate_audio_from_text_gradio(
          return log_message("🛑 خروجی صوتی نیست.", logs), None, None, gr.update(visible=False)
     return logs, final_audio_path, zip_file_path, gr.update(visible=zip_visible)
-# --- Gradio UI (Largely unchanged, ensure default values are correct) ---
 css = """
 body { direction: rtl; }
 .rtl_override { direction: rtl !important; text-align: right !important; }
@@ -311,34 +322,30 @@ footer { display: none !important; }
 .gradio-container { max-width: 800px !important; margin: auto !important; }
 """
 API_KEY_FROM_ENV = os.environ.get("GEMINI_API_KEY")
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
     gr.Markdown(
         """
         <div style='text-align: center; font-family: "Arial", sans-serif;'>
             <h1 class='rtl_override'>تبدیل متن به گفتار با Gemini API</h1>
-            <p class='rtl_override'>توجه: قابلیت "پرامپت سبک گفتار" فعلا برای این مدل‌ها به طور کامل پشتیبانی نمی‌شود.</p>
         </div>
         """
-    ) # Added a note about speech prompt
     api_key_status_text = "⚠️ کلید API جمینای (GEMINI_API_KEY) در Secrets این اسپیس تنظیم نشده است."
     if API_KEY_FROM_ENV: api_key_status_text = "✅ کلید API جمینای از Secrets بارگذاری شد."
     gr.Markdown(f"<p style='text-align:center; color: {'green' if API_KEY_FROM_ENV else 'red'};' class='rtl_override'>{api_key_status_text}</p>")
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("<h3 class='rtl_override'>تنظیمات ورودی</h3>", elem_classes="rtl_override")
             input_method_radio = gr.Radio(["ورودی متنی", "آپلود فایل"], label="روش ورودی", value="ورودی متنی", elem_classes="rtl_override")
             text_to_speak_area = gr.Textbox(label="متن مورد نظر", placeholder="متن خود را اینجا وارد کنید...", lines=5, visible=True, elem_classes="rtl_override")
             uploaded_file_input = gr.File(label="فایل متنی (.txt)", file_types=[".txt"], visible=False, elem_classes="rtl_override") # type: ignore
-            speech_prompt_area = gr.Textbox(label="پرامپت سبک گفتار (اختیاری - فعلا تاثیر محدود)", placeholder="مثال: شاد و پر انرژی", lines=2, elem_classes="rtl_override") # Clarified limited effect
             gr.Markdown("<h3 class='rtl_override'>تنظیمات مدل و خروجی</h3>", elem_classes="rtl_override")
             model_name_dropdown = gr.Dropdown(MODELS_LIST, label="مدل", value=MODELS_LIST[0], elem_classes="rtl_override")
-            speaker_voice_dropdown = gr.Dropdown(SPEAKER_VOICES_LIST, label="گوینده", value="Charon", elem_classes="rtl_override") # Default Charon
             temperature_slider = gr.Slider(minimum=0, maximum=2, step=0.05, value=1.0, label="دما", elem_classes="rtl_override")
             output_filename_base_input = gr.Textbox(value="gemini_tts_output", label="نام پایه فایل خروجی", elem_classes="rtl_override")
         with gr.Column(scale=1):
             gr.Markdown("<h3 class='rtl_override'>تنظیمات پیشرفته</h3>", elem_classes="rtl_override")
             max_chunk_size_slider = gr.Slider(minimum=2000, maximum=4000, step=100, value=3800, label="حداکثر کاراکتر در قطعه", elem_classes="rtl_override")
@@ -347,24 +354,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"),
             pydub_warn_lbl = " (pydub نیست!)" if not PYDUB_AVAILABLE else ""
             del_partial_lbl = f"حذف فایل‌های جزئی{pydub_warn_lbl}"
             delete_partial_files_checkbox = gr.Checkbox(value=False, label=del_partial_lbl, interactive=PYDUB_AVAILABLE, elem_classes="rtl_override")
     submit_button = gr.Button("🎤 تولید صدا", variant="primary", elem_id="submit_button_custom")
     gr.Markdown("<h3 class='rtl_override'>خروجی</h3>", elem_classes="rtl_override")
     status_output_area = gr.Textbox(label="پیام‌های وضعیت", lines=10, interactive=False, elem_classes="rtl_override")
     with gr.Row():
         audio_player_output = gr.Audio(label="فایل صوتی نهایی/اولین قطعه", type="filepath", elem_classes="rtl_override") # type: ignore
         zip_file_output = gr.File(label="دانلود همه قطعات (ZIP)", type="filepath", visible=False, elem_classes="rtl_override") # type: ignore
     def toggle_input_method_visibility(method): return (gr.update(visible=True), gr.update(visible=False)) if method == "ورودی متنی" else (gr.update(visible=False), gr.update(visible=True))
     input_method_radio.change(fn=toggle_input_method_visibility, inputs=input_method_radio, outputs=[text_to_speak_area, uploaded_file_input])
     def update_delete_partials_interactive(merge_checked): return gr.update(interactive=merge_checked and PYDUB_AVAILABLE)
     merge_audio_files_checkbox.change(fn=update_delete_partials_interactive, inputs=merge_audio_files_checkbox, outputs=delete_partial_files_checkbox)
     def trigger_generation_with_api_key(*args_from_ui):
         hf_secret_key = os.environ.get("GEMINI_API_KEY")
         return generate_audio_from_text_gradio(hf_secret_key, *args_from_ui)
     submit_inputs = [input_method_radio, text_to_speak_area, uploaded_file_input, speech_prompt_area, model_name_dropdown, speaker_voice_dropdown, temperature_slider, max_chunk_size_slider, sleep_between_requests_slider, output_filename_base_input, merge_audio_files_checkbox, delete_partial_files_checkbox]
     submit_outputs = [status_output_area, audio_player_output, zip_file_output, zip_file_output]
     submit_button.click(fn=trigger_generation_with_api_key, inputs=submit_inputs, outputs=submit_outputs)

 import zipfile
 import traceback # For detailed error logging if needed
 from google import genai
+from google.genai import types as genai_types
 try:
     from pydub import AudioSegment
     full_path = os.path.join(OUTPUT_DIR, file_name)
     try:
         with open(full_path, "wb") as f: f.write(data)
+        current_logs = log_func(f"✅ فایل: {full_path}", current_logs)
         return full_path, current_logs
     except Exception as e:
+        current_logs = log_func(f"❌ خطا ذخیره {file_name}: {e}", current_logs)
         return None, current_logs
 def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
     return {"bits_per_sample": bits_per_sample, "rate": rate}
 def load_text_from_file(file_obj, log_func, current_logs):
+    if file_obj is None: return "", log_func("❌ فایل آپلود نشد.", current_logs)
     file_path = file_obj.name
+    current_logs = log_func(f"✅ فایل '{os.path.basename(file_path)}' دریافت.", current_logs)
     try:
         with open(file_path, 'r', encoding='utf-8') as f: content = f.read().strip()
         current_logs = log_func(f"📖 متن: {len(content)} کاراکتر. نمونه: '{content[:100]}{'...' if len(content) > 100 else ''}'", current_logs)
         return content, current_logs
+    except Exception as e: return "", log_func(f"❌ خطا خواندن فایل: {e}", current_logs)
 def smart_text_split(text, max_size=3800):
     if len(text) <= max_size: return [text]
     text_chunks = smart_text_split(text_input_content, max_chunk_size_ui)
     logs = log_message(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.", logs)
+    for i, chunk_text in enumerate(text_chunks):
         logs = log_message(f"📝 قطعه {i+1}: {len(chunk_text)} کاراکتر", logs)
     generated_files = []
+    for i, chunk_for_api in enumerate(text_chunks):
         logs = log_message(f"\n🔊 تولید صدا قطعه {i+1}/{len(text_chunks)}...", logs)
+        # REVERTING to adding speech_prompt to the text, as per Colab's presumed successful logic
+        # Using a simple concatenation. The Colab might have had a more specific format.
+        # If speech_prompt_ui is "شاد و پر انرژی" and chunk_for_api is "سلام دنیا"
+        # final_text_for_api will be "شاد و پر انرژی\nسلام دنیا"
+        if speech_prompt_ui and speech_prompt_ui.strip():
+            final_text_for_api = f"{speech_prompt_ui.strip()}\n{chunk_for_api}"
+            logs = log_message(f"ℹ️ پرامپت سبک '{speech_prompt_ui.strip()}' به متن اضافه شد.", logs)
+        else:
+            final_text_for_api = chunk_for_api
         api_contents = [
             genai_types.Content(
                     mime_type_from_api = inline_data.mime_type
                 elif chunk_response.text:
                     log_text = f"💬 پیام API قطعه {i+1}: {chunk_response.text}"
+                    # Check if it's an error that might indicate the prompt was misunderstood or caused an issue
+                    if "error" in chunk_response.text.lower() or "failed" in chunk_response.text.lower() or "invalid input" in chunk_response.text.lower():
+                        logs = log_message(f"❌ {log_text} (ممکن است به دلیل پرامپت سبک باشد)", logs)
                     else:
                         logs = log_message(f"ℹ️ {log_text}", logs)
                     generated_files.append(saved_file_path)
                     logs = log_message(f"✅ قطعه {i+1} تولید شد: {saved_file_path}", logs)
             else:
+                if not f"❌ پیام API قطعه {i+1}" in logs:
                     logs = log_message(f"❌ قطعه {i+1} بدون داده صوتی.", logs)
         except Exception as e:
             error_msg = f"❌ خطا تولید قطعه {i+1}: {type(e).__name__} - {e}"
+            # Check if the error message from API (if any in e.args) mentions input format or similar
+            if hasattr(e, 'args') and e.args and isinstance(e.args[0], str) and ("input" in e.args[0].lower() or "parse" in e.args[0].lower()):
+                error_msg += "\n   (ممکن است خطا به دلیل فرمت پرامپت سبک الحاق شده به متن باشد)"
             if "API_KEY_INVALID" in str(e): error_msg += "\n🔑 کلید API نامعتبر."
             elif "permission" in str(e).lower() or "403" in str(e): error_msg += f"\n🚫 عدم دسترسی به {model_name_ui}."
             elif "429" in str(e) or "quota" in str(e).lower(): error_msg += f"\n🐢 محدودیت Quota."
          return log_message("🛑 خروجی صوتی نیست.", logs), None, None, gr.update(visible=False)
     return logs, final_audio_path, zip_file_path, gr.update(visible=zip_visible)
+# --- Gradio UI (unchanged) ---
 css = """
 body { direction: rtl; }
 .rtl_override { direction: rtl !important; text-align: right !important; }
 .gradio-container { max-width: 800px !important; margin: auto !important; }
 """
 API_KEY_FROM_ENV = os.environ.get("GEMINI_API_KEY")
 with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
     gr.Markdown(
         """
         <div style='text-align: center; font-family: "Arial", sans-serif;'>
             <h1 class='rtl_override'>تبدیل متن به گفتار با Gemini API</h1>
+            <p class='rtl_override'>توجه: تاثیر "پرامپت سبک گفتار" به نحوه تفسیر مدل بستگی دارد.</p>
         </div>
         """
+    )
     api_key_status_text = "⚠️ کلید API جمینای (GEMINI_API_KEY) در Secrets این اسپیس تنظیم نشده است."
     if API_KEY_FROM_ENV: api_key_status_text = "✅ کلید API جمینای از Secrets بارگذاری شد."
     gr.Markdown(f"<p style='text-align:center; color: {'green' if API_KEY_FROM_ENV else 'red'};' class='rtl_override'>{api_key_status_text}</p>")
     with gr.Row():
         with gr.Column(scale=2):
             gr.Markdown("<h3 class='rtl_override'>تنظیمات ورودی</h3>", elem_classes="rtl_override")
             input_method_radio = gr.Radio(["ورودی متنی", "آپلود فایل"], label="روش ورودی", value="ورودی متنی", elem_classes="rtl_override")
             text_to_speak_area = gr.Textbox(label="متن مورد نظر", placeholder="متن خود را اینجا وارد کنید...", lines=5, visible=True, elem_classes="rtl_override")
             uploaded_file_input = gr.File(label="فایل متنی (.txt)", file_types=[".txt"], visible=False, elem_classes="rtl_override") # type: ignore
+            speech_prompt_area = gr.Textbox(label="پرامپت سبک گفتار (اختیاری)", placeholder="مثال: شاد و پر انرژی", lines=2, elem_classes="rtl_override")
             gr.Markdown("<h3 class='rtl_override'>تنظیمات مدل و خروجی</h3>", elem_classes="rtl_override")
             model_name_dropdown = gr.Dropdown(MODELS_LIST, label="مدل", value=MODELS_LIST[0], elem_classes="rtl_override")
+            speaker_voice_dropdown = gr.Dropdown(SPEAKER_VOICES_LIST, label="گوینده", value="Charon", elem_classes="rtl_override")
             temperature_slider = gr.Slider(minimum=0, maximum=2, step=0.05, value=1.0, label="دما", elem_classes="rtl_override")
             output_filename_base_input = gr.Textbox(value="gemini_tts_output", label="نام پایه فایل خروجی", elem_classes="rtl_override")
         with gr.Column(scale=1):
             gr.Markdown("<h3 class='rtl_override'>تنظیمات پیشرفته</h3>", elem_classes="rtl_override")
             max_chunk_size_slider = gr.Slider(minimum=2000, maximum=4000, step=100, value=3800, label="حداکثر کاراکتر در قطعه", elem_classes="rtl_override")
             pydub_warn_lbl = " (pydub نیست!)" if not PYDUB_AVAILABLE else ""
             del_partial_lbl = f"حذف فایل‌های جزئی{pydub_warn_lbl}"
             delete_partial_files_checkbox = gr.Checkbox(value=False, label=del_partial_lbl, interactive=PYDUB_AVAILABLE, elem_classes="rtl_override")
     submit_button = gr.Button("🎤 تولید صدا", variant="primary", elem_id="submit_button_custom")
     gr.Markdown("<h3 class='rtl_override'>خروجی</h3>", elem_classes="rtl_override")
     status_output_area = gr.Textbox(label="پیام‌های وضعیت", lines=10, interactive=False, elem_classes="rtl_override")
     with gr.Row():
         audio_player_output = gr.Audio(label="فایل صوتی نهایی/اولین قطعه", type="filepath", elem_classes="rtl_override") # type: ignore
         zip_file_output = gr.File(label="دانلود همه قطعات (ZIP)", type="filepath", visible=False, elem_classes="rtl_override") # type: ignore
     def toggle_input_method_visibility(method): return (gr.update(visible=True), gr.update(visible=False)) if method == "ورودی متنی" else (gr.update(visible=False), gr.update(visible=True))
     input_method_radio.change(fn=toggle_input_method_visibility, inputs=input_method_radio, outputs=[text_to_speak_area, uploaded_file_input])
     def update_delete_partials_interactive(merge_checked): return gr.update(interactive=merge_checked and PYDUB_AVAILABLE)
     merge_audio_files_checkbox.change(fn=update_delete_partials_interactive, inputs=merge_audio_files_checkbox, outputs=delete_partial_files_checkbox)
     def trigger_generation_with_api_key(*args_from_ui):
         hf_secret_key = os.environ.get("GEMINI_API_KEY")
         return generate_audio_from_text_gradio(hf_secret_key, *args_from_ui)
     submit_inputs = [input_method_radio, text_to_speak_area, uploaded_file_input, speech_prompt_area, model_name_dropdown, speaker_voice_dropdown, temperature_slider, max_chunk_size_slider, sleep_between_requests_slider, output_filename_base_input, merge_audio_files_checkbox, delete_partial_files_checkbox]
     submit_outputs = [status_output_area, audio_player_output, zip_file_output, zip_file_output]
     submit_button.click(fn=trigger_generation_with_api_key, inputs=submit_inputs, outputs=submit_outputs)