Update app.py
Browse files
app.py
CHANGED
@@ -8,7 +8,7 @@ import time
|
|
8 |
import zipfile
|
9 |
import traceback # For detailed error logging if needed
|
10 |
from google import genai
|
11 |
-
from google.genai import types as genai_types
|
12 |
|
13 |
try:
|
14 |
from pydub import AudioSegment
|
@@ -40,10 +40,10 @@ def save_binary_file(file_name, data, log_func, current_logs):
|
|
40 |
full_path = os.path.join(OUTPUT_DIR, file_name)
|
41 |
try:
|
42 |
with open(full_path, "wb") as f: f.write(data)
|
43 |
-
current_logs = log_func(f"✅
|
44 |
return full_path, current_logs
|
45 |
except Exception as e:
|
46 |
-
current_logs = log_func(f"❌ خطا
|
47 |
return None, current_logs
|
48 |
|
49 |
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
|
@@ -70,14 +70,14 @@ def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
|
|
70 |
return {"bits_per_sample": bits_per_sample, "rate": rate}
|
71 |
|
72 |
def load_text_from_file(file_obj, log_func, current_logs):
|
73 |
-
if file_obj is None: return "", log_func("❌
|
74 |
file_path = file_obj.name
|
75 |
-
current_logs = log_func(f"✅ فایل '{os.path.basename(file_path)}'
|
76 |
try:
|
77 |
with open(file_path, 'r', encoding='utf-8') as f: content = f.read().strip()
|
78 |
current_logs = log_func(f"📖 متن: {len(content)} کاراکتر. نمونه: '{content[:100]}{'...' if len(content) > 100 else ''}'", current_logs)
|
79 |
return content, current_logs
|
80 |
-
except Exception as e: return "", log_func(f"❌ خطا
|
81 |
|
82 |
def smart_text_split(text, max_size=3800):
|
83 |
if len(text) <= max_size: return [text]
|
@@ -168,16 +168,22 @@ def generate_audio_from_text_gradio(
|
|
168 |
|
169 |
text_chunks = smart_text_split(text_input_content, max_chunk_size_ui)
|
170 |
logs = log_message(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.", logs)
|
171 |
-
for i, chunk_text in enumerate(text_chunks):
|
172 |
logs = log_message(f"📝 قطعه {i+1}: {len(chunk_text)} کاراکتر", logs)
|
173 |
|
174 |
generated_files = []
|
175 |
-
for i,
|
176 |
logs = log_message(f"\n🔊 تولید صدا قطعه {i+1}/{len(text_chunks)}...", logs)
|
177 |
|
178 |
-
#
|
179 |
-
#
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
api_contents = [
|
183 |
genai_types.Content(
|
@@ -219,8 +225,9 @@ def generate_audio_from_text_gradio(
|
|
219 |
mime_type_from_api = inline_data.mime_type
|
220 |
elif chunk_response.text:
|
221 |
log_text = f"💬 پیام API قطعه {i+1}: {chunk_response.text}"
|
222 |
-
if
|
223 |
-
|
|
|
224 |
else:
|
225 |
logs = log_message(f"ℹ️ {log_text}", logs)
|
226 |
|
@@ -246,11 +253,15 @@ def generate_audio_from_text_gradio(
|
|
246 |
generated_files.append(saved_file_path)
|
247 |
logs = log_message(f"✅ قطعه {i+1} تولید شد: {saved_file_path}", logs)
|
248 |
else:
|
249 |
-
if not f"❌ پیام API قطعه {i+1}" in logs:
|
250 |
logs = log_message(f"❌ قطعه {i+1} بدون داده صوتی.", logs)
|
251 |
|
252 |
except Exception as e:
|
253 |
error_msg = f"❌ خطا تولید قطعه {i+1}: {type(e).__name__} - {e}"
|
|
|
|
|
|
|
|
|
254 |
if "API_KEY_INVALID" in str(e): error_msg += "\n🔑 کلید API نامعتبر."
|
255 |
elif "permission" in str(e).lower() or "403" in str(e): error_msg += f"\n🚫 عدم دسترسی به {model_name_ui}."
|
256 |
elif "429" in str(e) or "quota" in str(e).lower(): error_msg += f"\n🐢 محدودیت Quota."
|
@@ -301,7 +312,7 @@ def generate_audio_from_text_gradio(
|
|
301 |
return log_message("🛑 خروجی صوتی نیست.", logs), None, None, gr.update(visible=False)
|
302 |
return logs, final_audio_path, zip_file_path, gr.update(visible=zip_visible)
|
303 |
|
304 |
-
# --- Gradio UI (
|
305 |
css = """
|
306 |
body { direction: rtl; }
|
307 |
.rtl_override { direction: rtl !important; text-align: right !important; }
|
@@ -311,34 +322,30 @@ footer { display: none !important; }
|
|
311 |
.gradio-container { max-width: 800px !important; margin: auto !important; }
|
312 |
"""
|
313 |
API_KEY_FROM_ENV = os.environ.get("GEMINI_API_KEY")
|
314 |
-
|
315 |
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
|
316 |
gr.Markdown(
|
317 |
"""
|
318 |
<div style='text-align: center; font-family: "Arial", sans-serif;'>
|
319 |
<h1 class='rtl_override'>تبدیل متن به گفتار با Gemini API</h1>
|
320 |
-
<p class='rtl_override'>توجه:
|
321 |
</div>
|
322 |
"""
|
323 |
-
)
|
324 |
api_key_status_text = "⚠️ کلید API جمینای (GEMINI_API_KEY) در Secrets این اسپیس تنظیم نشده است."
|
325 |
if API_KEY_FROM_ENV: api_key_status_text = "✅ کلید API جمینای از Secrets بارگذاری شد."
|
326 |
gr.Markdown(f"<p style='text-align:center; color: {'green' if API_KEY_FROM_ENV else 'red'};' class='rtl_override'>{api_key_status_text}</p>")
|
327 |
-
|
328 |
with gr.Row():
|
329 |
with gr.Column(scale=2):
|
330 |
gr.Markdown("<h3 class='rtl_override'>تنظیمات ورودی</h3>", elem_classes="rtl_override")
|
331 |
input_method_radio = gr.Radio(["ورودی متنی", "آپلود فایل"], label="روش ورودی", value="ورودی متنی", elem_classes="rtl_override")
|
332 |
text_to_speak_area = gr.Textbox(label="متن مورد نظر", placeholder="متن خود را اینجا وارد کنید...", lines=5, visible=True, elem_classes="rtl_override")
|
333 |
uploaded_file_input = gr.File(label="فایل متنی (.txt)", file_types=[".txt"], visible=False, elem_classes="rtl_override") # type: ignore
|
334 |
-
speech_prompt_area = gr.Textbox(label="پرامپت سبک گفتار (اختیاری
|
335 |
-
|
336 |
gr.Markdown("<h3 class='rtl_override'>تنظیمات مدل و خروجی</h3>", elem_classes="rtl_override")
|
337 |
model_name_dropdown = gr.Dropdown(MODELS_LIST, label="مدل", value=MODELS_LIST[0], elem_classes="rtl_override")
|
338 |
-
speaker_voice_dropdown = gr.Dropdown(SPEAKER_VOICES_LIST, label="گوینده", value="Charon", elem_classes="rtl_override")
|
339 |
temperature_slider = gr.Slider(minimum=0, maximum=2, step=0.05, value=1.0, label="دما", elem_classes="rtl_override")
|
340 |
output_filename_base_input = gr.Textbox(value="gemini_tts_output", label="نام پایه فایل خروجی", elem_classes="rtl_override")
|
341 |
-
|
342 |
with gr.Column(scale=1):
|
343 |
gr.Markdown("<h3 class='rtl_override'>تنظیمات پیشرفته</h3>", elem_classes="rtl_override")
|
344 |
max_chunk_size_slider = gr.Slider(minimum=2000, maximum=4000, step=100, value=3800, label="حداکثر کاراکتر در قطعه", elem_classes="rtl_override")
|
@@ -347,24 +354,19 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"),
|
|
347 |
pydub_warn_lbl = " (pydub نیست!)" if not PYDUB_AVAILABLE else ""
|
348 |
del_partial_lbl = f"حذف فایلهای جزئی{pydub_warn_lbl}"
|
349 |
delete_partial_files_checkbox = gr.Checkbox(value=False, label=del_partial_lbl, interactive=PYDUB_AVAILABLE, elem_classes="rtl_override")
|
350 |
-
|
351 |
submit_button = gr.Button("🎤 تولید صدا", variant="primary", elem_id="submit_button_custom")
|
352 |
gr.Markdown("<h3 class='rtl_override'>خروجی</h3>", elem_classes="rtl_override")
|
353 |
status_output_area = gr.Textbox(label="پیامهای وضعیت", lines=10, interactive=False, elem_classes="rtl_override")
|
354 |
-
|
355 |
with gr.Row():
|
356 |
audio_player_output = gr.Audio(label="فایل صوتی نهایی/اولین قطعه", type="filepath", elem_classes="rtl_override") # type: ignore
|
357 |
zip_file_output = gr.File(label="دانلود همه قطعات (ZIP)", type="filepath", visible=False, elem_classes="rtl_override") # type: ignore
|
358 |
-
|
359 |
def toggle_input_method_visibility(method): return (gr.update(visible=True), gr.update(visible=False)) if method == "ورودی متنی" else (gr.update(visible=False), gr.update(visible=True))
|
360 |
input_method_radio.change(fn=toggle_input_method_visibility, inputs=input_method_radio, outputs=[text_to_speak_area, uploaded_file_input])
|
361 |
def update_delete_partials_interactive(merge_checked): return gr.update(interactive=merge_checked and PYDUB_AVAILABLE)
|
362 |
merge_audio_files_checkbox.change(fn=update_delete_partials_interactive, inputs=merge_audio_files_checkbox, outputs=delete_partial_files_checkbox)
|
363 |
-
|
364 |
def trigger_generation_with_api_key(*args_from_ui):
|
365 |
hf_secret_key = os.environ.get("GEMINI_API_KEY")
|
366 |
return generate_audio_from_text_gradio(hf_secret_key, *args_from_ui)
|
367 |
-
|
368 |
submit_inputs = [input_method_radio, text_to_speak_area, uploaded_file_input, speech_prompt_area, model_name_dropdown, speaker_voice_dropdown, temperature_slider, max_chunk_size_slider, sleep_between_requests_slider, output_filename_base_input, merge_audio_files_checkbox, delete_partial_files_checkbox]
|
369 |
submit_outputs = [status_output_area, audio_player_output, zip_file_output, zip_file_output]
|
370 |
submit_button.click(fn=trigger_generation_with_api_key, inputs=submit_inputs, outputs=submit_outputs)
|
|
|
8 |
import zipfile
|
9 |
import traceback # For detailed error logging if needed
|
10 |
from google import genai
|
11 |
+
from google.genai import types as genai_types
|
12 |
|
13 |
try:
|
14 |
from pydub import AudioSegment
|
|
|
40 |
full_path = os.path.join(OUTPUT_DIR, file_name)
|
41 |
try:
|
42 |
with open(full_path, "wb") as f: f.write(data)
|
43 |
+
current_logs = log_func(f"✅ فایل: {full_path}", current_logs)
|
44 |
return full_path, current_logs
|
45 |
except Exception as e:
|
46 |
+
current_logs = log_func(f"❌ خطا ذخیره {file_name}: {e}", current_logs)
|
47 |
return None, current_logs
|
48 |
|
49 |
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
|
|
|
70 |
return {"bits_per_sample": bits_per_sample, "rate": rate}
|
71 |
|
72 |
def load_text_from_file(file_obj, log_func, current_logs):
    """Read an uploaded text file and return its stripped content.

    Args:
        file_obj: The upload handed over by the UI. May be ``None`` (nothing
            uploaded), a filesystem path (``str`` or ``os.PathLike``), or an
            object exposing the path through a ``name`` attribute.
        log_func: Callable ``(message, logs) -> logs`` used to record status
            messages; its return value becomes the new log accumulator.
        current_logs: The log accumulator threaded through ``log_func``.

    Returns:
        A ``(content, logs)`` tuple. ``content`` is the file text with
        surrounding whitespace removed, or ``""`` when no file was supplied
        or the file could not be read.
    """
    if file_obj is None:
        return "", log_func("❌ فایل آپلود نشد.", current_logs)

    # Accept both a plain path and a file-like wrapper carrying `.name`, so
    # the function does not raise AttributeError when the upload arrives as a
    # path string (NOTE(review): Gradio's File component yields one or the
    # other depending on version / `type` setting -- confirm for this Space).
    if isinstance(file_obj, (str, os.PathLike)):
        file_path = file_obj
    else:
        file_path = getattr(file_obj, "name", None)
    if not file_path:
        # No usable path: treat it the same as a missing upload.
        return "", log_func("❌ فایل آپلود نشد.", current_logs)

    current_logs = log_func(f"✅ فایل '{os.path.basename(file_path)}' دریافت.", current_logs)

    try:
        # "utf-8-sig" decodes ordinary UTF-8 unchanged but drops a leading
        # BOM, which str.strip() would otherwise leave at the start of the
        # text that is later sent for synthesis.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            content = f.read().strip()
    except Exception as e:
        # UI boundary: report the failure in the log instead of crashing.
        return "", log_func(f"❌ خطا خواندن فایل: {e}", current_logs)

    preview_suffix = "..." if len(content) > 100 else ""
    current_logs = log_func(
        f"📖 متن: {len(content)} کاراکتر. نمونه: '{content[:100]}{preview_suffix}'",
        current_logs,
    )
    return content, current_logs
|
81 |
|
82 |
def smart_text_split(text, max_size=3800):
|
83 |
if len(text) <= max_size: return [text]
|
|
|
168 |
|
169 |
text_chunks = smart_text_split(text_input_content, max_chunk_size_ui)
|
170 |
logs = log_message(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.", logs)
|
171 |
+
for i, chunk_text in enumerate(text_chunks):
|
172 |
logs = log_message(f"📝 قطعه {i+1}: {len(chunk_text)} کاراکتر", logs)
|
173 |
|
174 |
generated_files = []
|
175 |
+
for i, chunk_for_api in enumerate(text_chunks):
|
176 |
logs = log_message(f"\n🔊 تولید صدا قطعه {i+1}/{len(text_chunks)}...", logs)
|
177 |
|
178 |
+
# REVERTING to adding speech_prompt to the text, as per Colab's presumed successful logic
|
179 |
+
# Using a simple concatenation. The Colab might have had a more specific format.
|
180 |
+
# If speech_prompt_ui is "شاد و پر انرژی" and chunk_for_api is "سلام دنیا"
|
181 |
+
# final_text_for_api will be "شاد و پر انرژی\nسلام دنیا"
|
182 |
+
if speech_prompt_ui and speech_prompt_ui.strip():
|
183 |
+
final_text_for_api = f"{speech_prompt_ui.strip()}\n{chunk_for_api}"
|
184 |
+
logs = log_message(f"ℹ️ پرامپت سبک '{speech_prompt_ui.strip()}' به متن اضافه شد.", logs)
|
185 |
+
else:
|
186 |
+
final_text_for_api = chunk_for_api
|
187 |
|
188 |
api_contents = [
|
189 |
genai_types.Content(
|
|
|
225 |
mime_type_from_api = inline_data.mime_type
|
226 |
elif chunk_response.text:
|
227 |
log_text = f"💬 پیام API قطعه {i+1}: {chunk_response.text}"
|
228 |
+
# Check if it's an error that might indicate the prompt was misunderstood or caused an issue
|
229 |
+
if "error" in chunk_response.text.lower() or "failed" in chunk_response.text.lower() or "invalid input" in chunk_response.text.lower():
|
230 |
+
logs = log_message(f"❌ {log_text} (ممکن است به دلیل پرامپت سبک باشد)", logs)
|
231 |
else:
|
232 |
logs = log_message(f"ℹ️ {log_text}", logs)
|
233 |
|
|
|
253 |
generated_files.append(saved_file_path)
|
254 |
logs = log_message(f"✅ قطعه {i+1} تولید شد: {saved_file_path}", logs)
|
255 |
else:
|
256 |
+
if not f"❌ پیام API قطعه {i+1}" in logs:
|
257 |
logs = log_message(f"❌ قطعه {i+1} بدون داده صوتی.", logs)
|
258 |
|
259 |
except Exception as e:
|
260 |
error_msg = f"❌ خطا تولید قطعه {i+1}: {type(e).__name__} - {e}"
|
261 |
+
# Check if the error message from API (if any in e.args) mentions input format or similar
|
262 |
+
if hasattr(e, 'args') and e.args and isinstance(e.args[0], str) and ("input" in e.args[0].lower() or "parse" in e.args[0].lower()):
|
263 |
+
error_msg += "\n (ممکن است خطا به دلیل فرمت پرامپت سبک الحاق شده به متن باشد)"
|
264 |
+
|
265 |
if "API_KEY_INVALID" in str(e): error_msg += "\n🔑 کلید API نامعتبر."
|
266 |
elif "permission" in str(e).lower() or "403" in str(e): error_msg += f"\n🚫 عدم دسترسی به {model_name_ui}."
|
267 |
elif "429" in str(e) or "quota" in str(e).lower(): error_msg += f"\n🐢 محدودیت Quota."
|
|
|
312 |
return log_message("🛑 خروجی صوتی نیست.", logs), None, None, gr.update(visible=False)
|
313 |
return logs, final_audio_path, zip_file_path, gr.update(visible=zip_visible)
|
314 |
|
315 |
+
# --- Gradio UI (unchanged) ---
|
316 |
css = """
|
317 |
body { direction: rtl; }
|
318 |
.rtl_override { direction: rtl !important; text-align: right !important; }
|
|
|
322 |
.gradio-container { max-width: 800px !important; margin: auto !important; }
|
323 |
"""
|
324 |
API_KEY_FROM_ENV = os.environ.get("GEMINI_API_KEY")
|
|
|
325 |
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
|
326 |
gr.Markdown(
|
327 |
"""
|
328 |
<div style='text-align: center; font-family: "Arial", sans-serif;'>
|
329 |
<h1 class='rtl_override'>تبدیل متن به گفتار با Gemini API</h1>
|
330 |
+
<p class='rtl_override'>توجه: تاثیر "پرامپت سبک گفتار" به نحوه تفسیر مدل بستگی دارد.</p>
|
331 |
</div>
|
332 |
"""
|
333 |
+
)
|
334 |
api_key_status_text = "⚠️ کلید API جمینای (GEMINI_API_KEY) در Secrets این اسپیس تنظیم نشده است."
|
335 |
if API_KEY_FROM_ENV: api_key_status_text = "✅ کلید API جمینای از Secrets بارگذاری شد."
|
336 |
gr.Markdown(f"<p style='text-align:center; color: {'green' if API_KEY_FROM_ENV else 'red'};' class='rtl_override'>{api_key_status_text}</p>")
|
|
|
337 |
with gr.Row():
|
338 |
with gr.Column(scale=2):
|
339 |
gr.Markdown("<h3 class='rtl_override'>تنظیمات ورودی</h3>", elem_classes="rtl_override")
|
340 |
input_method_radio = gr.Radio(["ورودی متنی", "آپلود فایل"], label="روش ورودی", value="ورودی متنی", elem_classes="rtl_override")
|
341 |
text_to_speak_area = gr.Textbox(label="متن مورد نظر", placeholder="متن خود را اینجا وارد کنید...", lines=5, visible=True, elem_classes="rtl_override")
|
342 |
uploaded_file_input = gr.File(label="فایل متنی (.txt)", file_types=[".txt"], visible=False, elem_classes="rtl_override") # type: ignore
|
343 |
+
speech_prompt_area = gr.Textbox(label="پرامپت سبک گفتار (اختیاری)", placeholder="مثال: شاد و پر انرژی", lines=2, elem_classes="rtl_override")
|
|
|
344 |
gr.Markdown("<h3 class='rtl_override'>تنظیمات مدل و خروجی</h3>", elem_classes="rtl_override")
|
345 |
model_name_dropdown = gr.Dropdown(MODELS_LIST, label="مدل", value=MODELS_LIST[0], elem_classes="rtl_override")
|
346 |
+
speaker_voice_dropdown = gr.Dropdown(SPEAKER_VOICES_LIST, label="گوینده", value="Charon", elem_classes="rtl_override")
|
347 |
temperature_slider = gr.Slider(minimum=0, maximum=2, step=0.05, value=1.0, label="دما", elem_classes="rtl_override")
|
348 |
output_filename_base_input = gr.Textbox(value="gemini_tts_output", label="نام پایه فایل خروجی", elem_classes="rtl_override")
|
|
|
349 |
with gr.Column(scale=1):
|
350 |
gr.Markdown("<h3 class='rtl_override'>تنظیمات پیشرفته</h3>", elem_classes="rtl_override")
|
351 |
max_chunk_size_slider = gr.Slider(minimum=2000, maximum=4000, step=100, value=3800, label="حداکثر کاراکتر در قطعه", elem_classes="rtl_override")
|
|
|
354 |
pydub_warn_lbl = " (pydub نیست!)" if not PYDUB_AVAILABLE else ""
|
355 |
del_partial_lbl = f"حذف فایلهای جزئی{pydub_warn_lbl}"
|
356 |
delete_partial_files_checkbox = gr.Checkbox(value=False, label=del_partial_lbl, interactive=PYDUB_AVAILABLE, elem_classes="rtl_override")
|
|
|
357 |
submit_button = gr.Button("🎤 تولید صدا", variant="primary", elem_id="submit_button_custom")
|
358 |
gr.Markdown("<h3 class='rtl_override'>خروجی</h3>", elem_classes="rtl_override")
|
359 |
status_output_area = gr.Textbox(label="پیامهای وضعیت", lines=10, interactive=False, elem_classes="rtl_override")
|
|
|
360 |
with gr.Row():
|
361 |
audio_player_output = gr.Audio(label="فایل صوتی نهایی/اولین قطعه", type="filepath", elem_classes="rtl_override") # type: ignore
|
362 |
zip_file_output = gr.File(label="دانلود همه قطعات (ZIP)", type="filepath", visible=False, elem_classes="rtl_override") # type: ignore
|
|
|
363 |
def toggle_input_method_visibility(method):
    """Return visibility updates for the text box and the file upload.

    The result is a pair ordered as (text area update, file input update):
    the text area is shown and the file input hidden when ``method`` is the
    direct-text option, and the reverse for any other value.
    """
    show_text = method == "ورودی متنی"
    return gr.update(visible=show_text), gr.update(visible=not show_text)
|
364 |
input_method_radio.change(fn=toggle_input_method_visibility, inputs=input_method_radio, outputs=[text_to_speak_area, uploaded_file_input])
|
365 |
def update_delete_partials_interactive(merge_checked):
    """Return an update setting whether the delete-partials box is editable.

    The box is interactive only when merging is ticked and the module-level
    ``PYDUB_AVAILABLE`` flag is truthy.
    """
    can_delete = merge_checked and PYDUB_AVAILABLE
    return gr.update(interactive=can_delete)
|
366 |
merge_audio_files_checkbox.change(fn=update_delete_partials_interactive, inputs=merge_audio_files_checkbox, outputs=delete_partial_files_checkbox)
|
|
|
367 |
def trigger_generation_with_api_key(*args_from_ui):
    """Run audio generation with the API key read from the environment.

    ``GEMINI_API_KEY`` is looked up on every call and passed as the first
    argument to ``generate_audio_from_text_gradio``; the UI values follow
    unchanged, in the order received. The result is returned as is.
    """
    return generate_audio_from_text_gradio(
        os.environ.get("GEMINI_API_KEY"),
        *args_from_ui,
    )
|
|
|
370 |
submit_inputs = [input_method_radio, text_to_speak_area, uploaded_file_input, speech_prompt_area, model_name_dropdown, speaker_voice_dropdown, temperature_slider, max_chunk_size_slider, sleep_between_requests_slider, output_filename_base_input, merge_audio_files_checkbox, delete_partial_files_checkbox]
|
371 |
submit_outputs = [status_output_area, audio_player_output, zip_file_output, zip_file_output]
|
372 |
submit_button.click(fn=trigger_generation_with_api_key, inputs=submit_inputs, outputs=submit_outputs)
|