Update app.py
app.py (CHANGED)
@@ -78,7 +78,7 @@ SPEAKER_VOICES = [
    "Rasalthgeti", "Orus", "Aoede", "Callirrhoe", "Autonoe", "Enceladus",
    "Iapetus", "Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda"
]
FIXED_MODEL_NAME = "gemini-1.5-flash-preview-tts"
DEFAULT_MAX_CHUNK_SIZE = 3800
DEFAULT_SLEEP_BETWEEN_REQUESTS = 8
DEFAULT_OUTPUT_FILENAME_BASE = "alpha_tts_audio"
@@ -120,28 +120,26 @@ def parse_audio_mime_type(mime_type: str) -> dict[str, int]:
def smart_text_split(text, max_size=3800, log_list_ref=None):
    if len(text) <= max_size: return [text]
    chunks, current_chunk = [], ""
-    # Improved sentence splitting for Persian and English
    sentences = re.split(r'(?<=[.!?؟۔])\s+', text)
    for sentence in sentences:
        if len(current_chunk) + len(sentence) + 1 > max_size:
            if current_chunk: chunks.append(current_chunk.strip())
            current_chunk = sentence
            while len(current_chunk) > max_size:
-                # Try to split at common punctuation or spaces, working backwards
                split_idx = -1
                for punc in ['،', ',', ';', ':', ' ']:
                    idx = current_chunk.rfind(punc, max_size // 2, max_size)
                    if idx > split_idx: split_idx = idx

                if split_idx != -1:
                    part, current_chunk = current_chunk[:split_idx+1], current_chunk[split_idx+1:]
                else:
                    part, current_chunk = current_chunk[:max_size], current_chunk[max_size:]
                chunks.append(part.strip())
        else:
            current_chunk += (" " if current_chunk and sentence else "") + sentence
    if current_chunk: chunks.append(current_chunk.strip())
    final_chunks = [c for c in chunks if c]
    if log_list_ref: _log_tts(f"📊 متن به {len(final_chunks)} قطعه تقسیم شد.", log_list_ref)
    return final_chunks
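For orientation, a minimal usage sketch of `smart_text_split` as kept above; the sample text and sizes are illustrative only and assume the function is importable from app.py:

```python
# Illustrative only: exercises smart_text_split from app.py with made-up input.
sample_text = "این جمله اول است. این جمله دوم است؟ This is a third sentence. " * 200
chunks = smart_text_split(sample_text, max_size=3800)
print(f"{len(chunks)} chunks, sizes: {[len(c) for c in chunks]}")
# Every chunk stays at or under max_size; splits prefer sentence enders (. ! ? ؟ ۔),
# then fall back to '،', ',', ';', ':' or a space in the upper half of an oversized chunk.
```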
@@ -157,12 +155,12 @@ def merge_audio_files_func(file_paths, output_path, log_list_ref):
            if os.path.exists(fp):
                segment = AudioSegment.from_file(fp)
                combined += segment
                if i < len(file_paths) - 1:
                    combined += AudioSegment.silent(duration=150)
            else:
                _log_tts(f"⚠️ فایل صوتی برای ادغام یافت نشد: {fp}", log_list_ref)

        combined.export(output_path, format="wav")
        _log_tts(f"✅ فایل صوتی با موفقیت در '{output_path}' ادغام و ذخیره شد.", log_list_ref)
        return True
    except Exception as e:
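As a self-contained reference for the pydub path shown above, a small sketch of the same merge pattern; the file names are placeholders, not paths produced by this commit:

```python
# Sketch of the merge pattern used by merge_audio_files_func (placeholder file names).
from pydub import AudioSegment

parts = ["alpha_tts_audio_part001.wav", "alpha_tts_audio_part002.wav"]
combined = AudioSegment.empty()
for i, fp in enumerate(parts):
    combined += AudioSegment.from_file(fp)
    if i < len(parts) - 1:
        combined += AudioSegment.silent(duration=150)  # 150 ms gap between chunks, as above
combined.export("alpha_tts_audio_final.wav", format="wav")
```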
@@ -181,50 +179,7 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va
        return None, "خطا: کلید API جیمینای برای سرویس TTS در دسترس نیست."

    try:
-        # Configure genai with the specific API key for this operation
-        # Note: genai.configure is global. If running concurrent operations with different keys,
-        # genai.Client(api_key=api_key) is safer. Let's assume genai.Client for TTS.
-        client = genai.GenerativeModel(model_name=FIXED_MODEL_NAME, api_key=api_key) # Old way
-        # For specific model like TTS, often it's through client.models or genai.get_model
-        # The second script used: client = genai.Client(api_key=api_key)
-        # And then client.models.generate_content(model=FIXED_MODEL_NAME, ...)
-        # Let's stick to genai.Client for TTS as it's more direct for such models.
-
-        # Re-instantiate client with the specific key (safer than global configure if other parts of app use genai)
-        # However, the get_gemini_api_key_sync rotates a global index, so global configure is implied.
-        # For simplicity with provided key rotation:
-        current_genai_client = genai.get_model(f"models/{FIXED_MODEL_NAME}") # Simpler if model name is just 'tts-model'
-        # But FIXED_MODEL_NAME is "gemini-1.5-flash-preview-tts"
-        # This might require `genai.configure(api_key=api_key)` first.
-        # Let's use the direct method from AlphaTTS script for robustneess.
-        genai.configure(api_key=api_key) # Configure with the rotated key
-        # model_instance = genai.GenerativeModel(FIXED_MODEL_NAME) # This is usually for text/chat models
-        # For TTS, the AlphaTTS code used `client.models.generate_content`
-        # which implies `genai.Client(api_key=...)` then `client.models.generate_content(...)`
-        # Or if `genai.configure` is used, then `genai.generate_text(...)` or similar global funcs.
-        # Let's assume `genai.configure` is enough and then use a top-level function if available,
-        # or stick to client.
-
-        # Given the AlphaTTS structure:
-        # client = genai.Client(api_key=api_key) # This is the most direct way if Client takes api_key
-        # Let's assume genai.configure is what's intended with the key rotation logic.
-        # genai.configure(api_key=api_key) # Already done by key rotation
-        # This is a bit messy. Let's refine `get_gemini_api_key_sync` to also call `genai.configure`.
-        # No, `get_gemini_api_key_sync` should just return the key. The caller configures.
-
-        # Safest approach: configure genai globally for this operation
        genai.configure(api_key=api_key)
-        # Then use a model instance. The second script used client.models.generate_content
-        # which is not directly available on GenerativeModel typically.
-        # Let's try to use the structure from Alpha TTS as much as possible.
-        # It used `client.models.generate_content`.
-        # This means we might need to instantiate `genai.Client` instead of `genai.GenerativeModel`.
-        # However, `genai.Client` is usually for the full API surface.
-        # Let's try with `genai.GenerativeModel` and see if it supports speech config.
-        # If not, we'll need to use `genai.generate_content` with the full model path.
-
-        model_to_use_direct = f"models/{FIXED_MODEL_NAME}" # e.g., "models/gemini-1.5-flash-preview-tts"
-
    except Exception as e:
        _log_tts(f"❌ خطا در مقداردهی اولیه کلاینت Gemini: {e}", log_list_ref)
        return None, f"خطا در ارتباط با Gemini: {e}"
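The removed comments debate where to call `genai.configure`; what survives is a single global configure with whatever key the rotation helper returned. A minimal sketch of that pattern, with a hypothetical stand-in for `get_gemini_api_key_sync` (the real helper rotates a global index):

```python
# Hedged sketch of the rotate-then-configure pattern; get_next_key is a hypothetical stand-in.
import itertools
import google.generativeai as genai

_key_cycle = itertools.cycle(["GEMINI_KEY_1", "GEMINI_KEY_2"])  # placeholder keys

def get_next_key() -> str:
    return next(_key_cycle)

genai.configure(api_key=get_next_key())               # global; affects all genai calls in the process
tts_model = genai.GenerativeModel(FIXED_MODEL_NAME)   # FIXED_MODEL_NAME as defined earlier in app.py
```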
@@ -242,195 +197,74 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va
    for i, chunk in enumerate(text_chunks):
        _log_tts(f"🔊 پردازش قطعه {i+1}/{len(text_chunks)}...", log_list_ref)

-        # Constructing the request content based on AlphaTTS structure
        final_text_for_tts = f'"{prompt_input}"\n{chunk}' if prompt_input and prompt_input.strip() else chunk

-        # This part needs to align with how Gemini TTS API expects requests via the Python SDK
-        # AlphaTTS used:
-        # contents = [genai_types.Content(role="user", parts=[genai_types.Part.from_text(text=final_text)])]
-        # config = genai_types.GenerateContentConfig(temperature=temperature_val, response_modalities=["audio"],
-        #     speech_config=genai_types.SpeechConfig(voice_config=genai_types.VoiceConfig(
-        #         prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(voice_name=selected_voice))))
-        # response = client.models.generate_content(model=FIXED_MODEL_NAME, contents=contents, config=config)
-
-        # Using global `generate_content` after `genai.configure(api_key=...)`
        try:
-
-
-
-            #
-
-            #
+            # The `selected_voice` from the dropdown is not currently used in this call.
+            # This would require knowing the specific API parameter for voice selection with this model.
+            # For now, the model will use its default voice or whatever behavior is programmed.
+            # A more advanced implementation would pass `selected_voice` to the API if possible.
+
+            # This is where the SyntaxError occurred. The `custom_config_for_tts` variable was an
+            # incomplete assignment. It's removed/commented out. The actual config is inline below.
+            #
+            # # custom_config_for_tts = genai_types.GenerationConfig( # This seems to be the new way # THIS LINE CAUSED SyntaxError
+            #     temperature=temperature_val,
+            # # ... (rest of the commented out block) ...
+            # # )

-
-            # The `client.models.generate_content` from AlphaTTS is more specific.
-            # `genai.GenerativeModel(model_name).generate_content` is the current standard.
+            tts_model = genai.GenerativeModel(FIXED_MODEL_NAME)

-
-
-
-
-
-
-            # This needs verification against current Gemini SDK for TTS.
-            # The `response_modalities=["audio"]` and `speech_config` were used with `client.models.generate_content`.
-            # If `GenerativeModel` is used, the request structure might be different.
-            # The `generate_content` method of `GenerativeModel` takes `request_options` for things like `response_mime_type`.
-
-            # Let's revert to the structure most likely to work based on AlphaTTS's use of `client.models.generate_content`
-            # This means we may need to use `genai.Client(api_key=api_key).models.get(FIXED_MODEL_NAME).generate_content(...)`
-            # OR `genai.generate_content(model=f"models/{FIXED_MODEL_NAME}", contents=..., generation_config=..., speech_config=...)` if that signature exists.
-
-            # Simplest path if `genai.configure` is used and there's a global way:
-            # This is a common pattern for `GenerateContentRequest`
-            gc_request = genai_types.GenerateContentRequest(
-                model=f"models/{FIXED_MODEL_NAME}", # Ensure "models/" prefix if needed
-                contents=request_contents,
+            # Note: `selected_voice` is not used here yet. This means the dropdown for voice selection
+            # will not have an effect until this part is updated to correctly pass the voice
+            # to the Gemini API for the `FIXED_MODEL_NAME`.
+            # The `final_text_for_tts` includes the `prompt_input` for style.
+            response = tts_model.generate_content(
+                final_text_for_tts,
                generation_config=genai_types.GenerationConfig(
                    temperature=temperature_val,
                    response_mime_type="audio/wav"
                ),
-                # How to pass voice and prompt? This is SDK specific.
-                # Re-checking AlphaTTS: `speech_config` was part of `GenerateContentConfig` passed to `client.models.generate_content`
-                # This is non-standard for `genai.GenerationConfig`.
-                # It seems `genai.Client().model().generate_content()` has a different `config` param.
            )
-
-            # Let's use the exact structure from AlphaTTS for `config` as it was working there.
-            # This implies that `genai.generate_content` (global) or `GenerativeModel.generate_content`
-            # must accept a similar config object if `client.models.generate_content` is not used.
-
-            custom_config_for_tts = genai_types.GenerationConfig( # This seems to be the new way
-                temperature=temperature_val,
-                # response_modalities=["audio"], # This might be implicit or handled by response_mime_type
-                # The following was from AlphaTTS, might need to be adapted or is for older/different client path
-                # speech_config=genai_types.SpeechConfig(
-                #     voice_config=genai_types.VoiceConfig(
-                #         prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(voice_name=selected_voice)
-                #     )
-                # )
-                # For new Gemini 1.5 Flash/Pro with native audio output, it's often simpler:
-                # You might pass voice parameters differently, or the model implicitly handles it.
-                # The `FIXED_MODEL_NAME` "gemini-1.5-flash-preview-tts" suggests it IS a TTS model.
-                # Let's assume for now the model name and text input are enough, and voice/prompt are part of the text or model behavior.
-                # If `selected_voice` and `prompt_input` are crucial, they need to be part of the request.
-                # `prompt_input` can be part of `final_text_for_tts`.
-                # `selected_voice` needs a parameter in the API call.
-                # The Gemini API for TTS usually involves specifying the voice in the request.
-                # e.g. in `synthesis_input` or `voice_selection_params`
-
-            # The most up-to-date way for Gemini 1.5 Flash TTS might involve `tools` or specific TTS request structures.
-            # Given the AlphaTTS code, it used `speech_config`. Let's try to replicate.
-            # `genai.GenerativeModel.generate_content` has `generation_config` and `safety_settings`.
-            # The `speech_config` is not standard there.
-
-            # If `FIXED_MODEL_NAME` is a true generative model that can output audio,
-            # the prompt needs to guide it.
-            # "Generate an audio of the following text with voice {selected_voice}: {chunk}"
-            # This is less likely for specialized TTS models.
-
-            # Fallback to a more direct call if available for TTS, or ensure `GenerativeModel` is configured correctly.
-            # The most robust way is to use the specific SDK features for TTS.
-            # If `genai.get_model("models/text-to-speech")` exists:
-            # tts_service_model = genai.get_model("models/text-to-speech")
-            # response = tts_service_model.synthesize_speech(text=final_text_for_tts, voice=selected_voice, ...)
-            # This is typical for dedicated TTS.
-            # Since AlphaTTS used `gemini-X.Y-flash-preview-tts`, it's likely a multimodal model.
-
-            # Replicating the structure that AlphaTTS `client.models.generate_content` used:
-            # This is the most likely path to success given it worked in AlphaTTS.
-            # We need to call a similar function. `genai.GenerativeModel(FIXED_MODEL_NAME)` is the modern way.
-            tts_model = genai.GenerativeModel(FIXED_MODEL_NAME) # api_key is globally configured
-
-            # Constructing the specific configuration for TTS with GenerativeModel.
-            # This is where `selected_voice` and `prompt_input` (as system instruction or context) matter.
-            # The Gemini API documentation for multimodal models with audio output is key here.
-            # Often, it's done by specifying `response_mime_type='audio/wav'` in `generation_config`.
-            # The voice selection might be a parameter in `GenerationConfig` or part of the prompt for some models.
-
-            # Let's assume `selected_voice` can be part of the text prompt for now if not a direct API param.
-            # And `prompt_input` is part of the context.
-
-            # Simpler request, assuming model handles voice from name or general quality from prompt:
-            # The `prompt_input` from AlphaTTS was more like a system instruction for tone.
-            # The `selected_voice` was a specific voice model name.
-
-            # The crucial part from AlphaTTS was:
-            # speech_config = genai_types.SpeechConfig(
-            #     voice_config=genai_types.VoiceConfig(
-            #         prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(voice_name=selected_voice)))
-            # And this `speech_config` was passed into a `GenerateContentConfig` like object.
-            # Let's try to pass this to `tts_model.generate_content`.
-
-            # The new SDK might use `ToolConfig` for speech synthesis or `Content` with specific parts.
-            # For direct audio generation, it's often simpler:
-            response = tts_model.generate_content(
-                final_text_for_tts, # The text to synthesize
-                generation_config=genai_types.GenerationConfig(
-                    temperature=temperature_val,
-                    # Candidate count, stop sequences, etc.
-                    # For audio output, you specify the desired mime type:
-                    response_mime_type="audio/wav" # Or "audio/mp3" if supported
-                ),
-                # How to specify voice like 'Achird', 'Zephyr'?
-                # This is the missing link if `selected_voice` is not part of the model name itself.
-                # If `FIXED_MODEL_NAME` implies a voice, or if it's a general TTS synthesizer,
-                # voice selection must be in the request.
-                # The `speech_config` from AlphaTTS is the best hint.
-                # It might be that `genai.GenerativeModel` does not support this `speech_config`.
-                # And `genai.Client(api_key=...).models.get(model_name).generate_content(...)` was the correct path.
-
-                # Let's assume `request_options` might hold it, or a `Tool` for TTS.
-                # Given `SPEAKER_VOICES`, it implies a selection mechanism.
-                # If `genai_types.SpeechConfig` exists, it's meant to be used.
-                # Perhaps `GenerativeModel.generate_content(contents=..., generation_config=..., tools=...)`
-                # where one tool is configured for speech synthesis with the voice.
-
-                # Sticking to the most direct interpretation of AlphaTTS's success:
-                # It used a `config` object containing `speech_config`.
-                # `genai.GenerativeModel.generate_content`'s `generation_config` is standard.
-                # The `client.models.generate_content` in AlphaTTS might be an older or different SDK path.
-
-                # Let's try with the most direct `GenerativeModel` call, assuming voice is part of the prompt or fixed for the model.
-                # If `selected_voice` is critical, this will fail to use it.
-                # The `prompt_input` (e.g., "با لحنی شاد") can be prepended to `final_text_for_tts`.
-                # text_with_prompt_and_voice_hint = f"{prompt_input}. Voice: {selected_voice}. Text: {chunk}"
-                # This is a guess if dedicated voice parameters are not obvious.
-            )

            fname_base = f"{output_base_name}_part{i+1:03d}"

-
-
-
-
-
-
-                audio_bytes = response.parts[0].audio_data # This is a guess
-                mime_type = "audio/wav" # Assuming we requested WAV
+            audio_bytes = None
+            mime_type = None
+
+            if response.parts and hasattr(response.parts[0], 'blob') and response.parts[0].blob.mime_type.startswith("audio/"): # More common for new SDK
+                audio_bytes = response.parts[0].blob.data
+                mime_type = response.parts[0].blob.mime_type
            elif response.candidates and response.candidates[0].content.parts and response.candidates[0].content.parts[0].inline_data: # AlphaTTS way
                inline_data = response.candidates[0].content.parts[0].inline_data
                audio_bytes = inline_data.data
                mime_type = inline_data.mime_type
            else:
                audio_part = None
-
-
-
-
-
+                if response.parts:
+                    for part in response.parts:
+                        if hasattr(part, 'mime_type') and part.mime_type.startswith("audio/"): # Check for mime_type attr
+                            audio_part = part
+                            break
+                if audio_part and hasattr(audio_part, 'data'):
                    audio_bytes = audio_part.data
                    mime_type = audio_part.mime_type
                elif audio_part and hasattr(audio_part, '_blob'):
                    audio_bytes = audio_part._blob.data
                    mime_type = audio_part._blob.mime_type

+            if not audio_bytes:
                _log_tts(f"⚠️ پاسخ API برای قطعه {i+1} بدون داده صوتی معتبر دریافت شد.", log_list_ref)
-                _log_tts(f"Response structure: {response}", log_list_ref)
+                _log_tts(f"ساختار پاسخ (Response structure): {response}", log_list_ref)
                continue

-
-
+            if not mime_type: # Safety net if mime_type wasn't extracted
+                _log_tts(f"⚠️ MIME type برای قطعه {i+1} یافت نشد. پیشفرض wav.", log_list_ref)
+                mime_type = "audio/wav"
+
+
+            ext = mimetypes.guess_extension(mime_type) or ".wav"
+            if "audio/L" in mime_type and ext == ".wav":
                audio_bytes = convert_to_wav(audio_bytes, mime_type)
            if not ext.startswith("."): ext = "." + ext

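The removed comments keep circling back to the AlphaTTS call path, where `selected_voice` is carried in a `speech_config` rather than in the prompt text. A hedged reconstruction of that path using the `google-genai` client the comments describe; this is a sketch of the alternative, not the code this commit ships:

```python
# Sketch of the AlphaTTS-style call quoted in the removed comments (google-genai client).
# Assumes api_key, final_text_for_tts, temperature_val, selected_voice, FIXED_MODEL_NAME as in app.py.
from google import genai
from google.genai import types as genai_types

client = genai.Client(api_key=api_key)
contents = [genai_types.Content(role="user",
                                parts=[genai_types.Part.from_text(text=final_text_for_tts)])]
config = genai_types.GenerateContentConfig(
    temperature=temperature_val,
    response_modalities=["AUDIO"],
    speech_config=genai_types.SpeechConfig(
        voice_config=genai_types.VoiceConfig(
            prebuilt_voice_config=genai_types.PrebuiltVoiceConfig(voice_name=selected_voice))),
)
response = client.models.generate_content(model=FIXED_MODEL_NAME, contents=contents, config=config)
audio_bytes = response.candidates[0].content.parts[0].inline_data.data  # raw PCM or WAV bytes
```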
@@ -440,21 +274,9 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va

        except Exception as e:
            _log_tts(f"❌ خطا در تولید قطعه صوتی {i+1} با Gemini: {e}\n{traceback.format_exc()}", log_list_ref)
-            # Try to get more detailed error from Gemini if available
            if hasattr(e, 'response') and e.response:
-                _log_tts(f"Gemini API
-
-            error_message_for_ui = f"خطا در تولید صدا ({type(e).__name__})."
-            if NUM_GEMINI_KEYS > 1:
-                error_message_for_ui += " کلید بعدی امتحان خواهد شد اگر قطعات دیگری وجود داشته باشد." # Not quite, key rotates per call not per chunk failure.
-            # For now, a single chunk failure might stop the whole process if we don't continue.
-            # The loop `continue`s, so other chunks will be tried.
-            # The error message for the UI should be a summary at the end.
-            # This needs to be handled in `gradio_tts_interface`.
-            # Let's have `core_generate_audio` return (None, error_message) on first critical failure.
-            # No, it should try all chunks and return what it could.
-            # The `gradio_tts_interface` will decide the final message.
-            continue # Try next chunk
+                _log_tts(f"جزئیات خطای Gemini API: {e.response}", log_list_ref)
+            continue

        if i < len(text_chunks) - 1 and len(text_chunks) > 1:
            _log_tts(f"💤 توقف کوتاه ({sleep_time} ثانیه) قبل از پردازش قطعه بعدی...", log_list_ref)
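The kept code falls back to `convert_to_wav` when the response is raw PCM (a mime type like `audio/L16;rate=24000`), and an earlier hunk header references `parse_audio_mime_type`. Those helpers are not shown in this diff; below is a hedged sketch of how such helpers are commonly written, with `_sketch` suffixes to make clear they illustrate the idea rather than reproduce the implementations in app.py:

```python
# Hedged sketch of parse_audio_mime_type / convert_to_wav style helpers for raw PCM responses.
import struct

def parse_audio_mime_type_sketch(mime_type: str) -> dict[str, int]:
    bits, rate = 16, 24000                      # defaults if parameters are missing
    for part in mime_type.split(";"):
        part = part.strip()
        if part.lower().startswith("rate="):
            rate = int(part.split("=", 1)[1])
        elif part.startswith("audio/L"):
            bits = int(part.split("L", 1)[1])
    return {"bits_per_sample": bits, "rate": rate}

def convert_to_wav_sketch(audio_data: bytes, mime_type: str) -> bytes:
    p = parse_audio_mime_type_sketch(mime_type)
    bits, rate, channels = p["bits_per_sample"], p["rate"], 1
    byte_rate = rate * channels * bits // 8
    block_align = channels * bits // 8
    header = struct.pack("<4sI4s4sIHHIIHH4sI",
                         b"RIFF", 36 + len(audio_data), b"WAVE",
                         b"fmt ", 16, 1, channels, rate, byte_rate, block_align, bits,
                         b"data", len(audio_data))
    return header + audio_data                  # 44-byte RIFF/WAVE header + PCM payload
```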
@@ -467,23 +289,22 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va
    _log_tts(f"🎉 {len(generated_files)} فایل(های) صوتی با موفقیت تولید شد.", log_list_ref)

    final_audio_file = None
    final_output_path_base = f"{output_base_name}_final"

    if len(generated_files) > 1:
        if PYDUB_AVAILABLE:
            merged_fn = f"{final_output_path_base}.wav"
            if os.path.exists(merged_fn):
                try: os.remove(merged_fn)
                except OSError as e: _log_tts(f"⚠️ عدم امکان حذف فایل ادغام شده قبلی '{merged_fn}': {e}", log_list_ref)

            if merge_audio_files_func(generated_files, merged_fn, log_list_ref):
                final_audio_file = merged_fn
-                # Clean up individual parts
                for fp in generated_files:
                    if os.path.abspath(fp) != os.path.abspath(merged_fn):
                        try: os.remove(fp)
                        except OSError as e_del: _log_tts(f"⚠️ عدم امکان حذف فایل موقت '{fp}': {e_del}", log_list_ref)
            else:
                _log_tts("⚠️ ادغام فایلهای صوتی ناموفق بود. اولین قطعه ارائه میشود.", log_list_ref)
                if generated_files:
                    try:
@@ -493,16 +314,14 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va
                        if os.path.exists(fallback_fn): os.remove(fallback_fn)
                        os.rename(first_chunk_path, fallback_fn)
                        final_audio_file = fallback_fn
-                        # Clean up other parts
                        for i_gf in range(1, len(generated_files)):
                            try: os.remove(generated_files[i_gf])
                            except OSError as e_del: _log_tts(f"⚠️ عدم امکان حذف فایل موقت '{generated_files[i_gf]}': {e_del}", log_list_ref)
                    except Exception as e_rename_fallback:
                        _log_tts(f"خطا در تغییر نام فایل اولین قطعه (fallback): {e_rename_fallback}", log_list_ref)
                        final_audio_file = generated_files[0]
        else:
            _log_tts("⚠️ Pydub برای ادغام در دسترس نیست. اولین قطعه صوتی ارائه میشود.", log_list_ref)
-            # Similar renaming logic for the first part as above for consistency
            if generated_files:
                try:
                    first_chunk_path = generated_files[0]
@@ -511,9 +330,6 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va
                    if os.path.exists(single_fallback_fn): os.remove(single_fallback_fn)
                    os.rename(first_chunk_path, single_fallback_fn)
                    final_audio_file = single_fallback_fn
-                    # Clean up other parts (optional, user might want them if no merge)
-                    # For simplicity, let's not clean them up if pydub is missing, they might be useful.
-                    # Or, offer a ZIP. For now, just the first.
                    for i_gf in range(1, len(generated_files)):
                        _log_tts(f"قطعه اضافی موجود: {generated_files[i_gf]} (ادغام نشده)", log_list_ref)

@@ -535,19 +351,16 @@ def core_generate_audio(text_input, prompt_input, selected_voice, temperature_va
                final_audio_file = final_single_fn
            except Exception as e_rename_single:
                _log_tts(f"خطا در تغییر نام فایل تکی نهایی: {e_rename_single}", log_list_ref)
                final_audio_file = generated_files[0]

    if final_audio_file and not os.path.exists(final_audio_file):
        _log_tts(f"⚠️ فایل صوتی نهایی '{final_audio_file}' پس از پردازش وجود ندارد!", log_list_ref)
        return None, "خطا: فایل صوتی نهایی یافت نشد."

    return final_audio_file, "موفق"

def gradio_tts_interface(use_file_input, uploaded_file, text_to_speak, speech_prompt, speaker_voice, temperature):
-
-    # progress=gr.Progress(track_tqdm=True)
-
-    logs_for_this_run = [] # For this specific run, to potentially show user or debug
+    logs_for_this_run = []
    actual_text = ""
    status_message = "شروع پردازش..."
    final_audio_path = None
@@ -577,7 +390,6 @@ def gradio_tts_interface(use_file_input, uploaded_file, text_to_speak, speech_pr
    _log_tts(f"تنظیمات: Speaker={speaker_voice}, Temp={temperature}, Prompt='{speech_prompt[:30]}...'", logs_for_this_run)

    try:
-        # Call the core audio generation function
        final_audio_path, generation_status_msg = core_generate_audio(
            actual_text, speech_prompt, speaker_voice, temperature, logs_for_this_run
        )
@@ -586,11 +398,11 @@ def gradio_tts_interface(use_file_input, uploaded_file, text_to_speak, speech_pr
            status_message = "✅ تبدیل متن به گفتار با موفقیت انجام شد."
            _log_tts(status_message, logs_for_this_run)
            return final_audio_path, status_message
        elif final_audio_path and generation_status_msg != "موفق":
            status_message = f"⚠️ {generation_status_msg}. فایل صوتی ممکن است ناقص باشد: {final_audio_path}"
            _log_tts(status_message, logs_for_this_run)
            return final_audio_path, status_message
        else:
            status_message = f"❌ {generation_status_msg}"
            _log_tts(status_message, logs_for_this_run)
            return None, status_message
@@ -605,22 +417,21 @@ def gradio_tts_interface(use_file_input, uploaded_file, text_to_speak, speech_pr
# --- START: بخش UI و Gradio (Adapted from Alpha Translator, content from Alpha TTS) ---
FLY_PRIMARY_COLOR_HEX = "#4F46E5"
FLY_SECONDARY_COLOR_HEX = "#10B981"
FLY_ACCENT_COLOR_HEX = "#D97706"
FLY_TEXT_COLOR_HEX = "#1F2937"
FLY_SUBTLE_TEXT_HEX = "#6B7280"
FLY_LIGHT_BACKGROUND_HEX = "#F9FAFB"
FLY_WHITE_HEX = "#FFFFFF"
FLY_BORDER_COLOR_HEX = "#D1D5DB"
FLY_INPUT_BG_HEX_SIMPLE = "#F3F4F6"
FLY_PANEL_BG_SIMPLE = "#E0F2FE"

app_theme_outer = gr.themes.Base(
    font=[gr.themes.GoogleFont("Inter"), "system-ui", "sans-serif"],
).set(
    body_background_fill=FLY_LIGHT_BACKGROUND_HEX,
)

-# Using CSS from Alpha Translator (first code)
custom_css = f"""
@import url('https://fonts.googleapis.com/css2?family=Vazirmatn:wght@300;400;500;600;700;800&display=swap');
@import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;500;600;700;800&display=swap');
@@ -632,7 +443,7 @@ custom_css = f"""
--fly-bg-white: {FLY_WHITE_HEX}; --fly-border-color: {FLY_BORDER_COLOR_HEX};
--fly-input-bg-simple: {FLY_INPUT_BG_HEX_SIMPLE}; --fly-panel-bg-simple: {FLY_PANEL_BG_SIMPLE};
--font-global: 'Vazirmatn', 'Inter', 'Poppins', system-ui, sans-serif;
--font-english: 'Poppins', 'Inter', system-ui, sans-serif;
--radius-sm: 0.375rem; --radius-md: 0.5rem; --radius-lg: 0.75rem; --radius-xl: 1rem; --radius-full: 9999px;
--shadow-sm: 0 1px 2px 0 rgba(0,0,0,0.05); --shadow-md: 0 4px 6px -1px rgba(0,0,0,0.1),0 2px 4px -2px rgba(0,0,0,0.1);
--shadow-lg: 0 10px 15px -3px rgba(0,0,0,0.1),0 4px 6px -4px rgba(0,0,0,0.1);
@@ -649,38 +460,29 @@ body {{font-family:var(--font-global);direction:rtl;background-color:var(--fly-b
footer,.gradio-footer,.flagging-container,.flex.row.gap-2.absolute.bottom-2.right-2.gr-compact.gr-box.gr-text-gray-500,div[data-testid="flag"],button[title="Flag"],button[aria-label="Flag"],.footer-utils {{display:none !important;visibility:hidden !important;}}
.main-content-area {{flex-grow:1;padding:0.75rem;width:100%;margin:0 auto;box-sizing:border-box;}}
.content-panel-simple {{background-color:var(--fly-bg-white);padding:1rem;border-radius:var(--radius-xl);box-shadow:var(--shadow-xl);margin-top:-2rem;position:relative;z-index:10;margin-bottom:2rem;width:100%;box-sizing:border-box;}}
-/* Main button styling from Alpha Translator */
.content-panel-simple .gr-button.lg.primary,.content-panel-simple button[variant="primary"] {{background:var(--fly-accent) !important;margin-top:1rem !important;padding:12px 20px !important;transition:all 0.25s ease-in-out !important;color:white !important;font-weight:600 !important;border-radius:10px !important;border:none !important;box-shadow:0 3px 8px -1px rgba(var(--fly-accent-rgb),0.3) !important;width:100% !important;font-size:1em !important;display:flex;align-items:center;justify-content:center;}}
.content-panel-simple .gr-button.lg.primary:hover,.content-panel-simple button[variant="primary"]:hover {{background:#B45309 !important;transform:translateY(-1px) !important;box-shadow:0 5px 10px -1px rgba(var(--fly-accent-rgb),0.4) !important;}}
-/* Input styling from Alpha Translator */
.content-panel-simple .gr-input > label + div > textarea,.content-panel-simple .gr-dropdown > label + div > div > input,.content-panel-simple .gr-dropdown > label + div > div > select,.content-panel-simple .gr-textbox > label + div > textarea, .content-panel-simple .gr-file > label + div {{border-radius:8px !important;border:1.5px solid var(--fly-border-color) !important;font-size:0.95em !important;background-color:var(--fly-input-bg-simple) !important;padding:10px 12px !important;color:var(--fly-text-primary) !important;}}
.content-panel-simple .gr-input > label + div > textarea:focus,.content-panel-simple .gr-dropdown > label + div > div > input:focus,.content-panel-simple .gr-dropdown > label + div > div > select:focus,.content-panel-simple .gr-textbox > label + div > textarea:focus, .content-panel-simple .gr-file > label + div:focus-within {{border-color:var(--fly-primary) !important;box-shadow:0 0 0 3px rgba(var(--fly-primary-rgb),0.12) !important;background-color:var(--fly-bg-white) !important;}}
.content-panel-simple .gr-file > label + div {{ text-align:center; border-style: dashed !important; }}
.content-panel-simple .gr-dropdown select {{font-family:var(--font-global) !important;width:100%;cursor:pointer;}}
-/* Output text area styling (if we add one for status messages) */
.content-panel-simple .gr-textbox[label*="وضعیت"] > label + div > textarea {{background-color:var(--fly-panel-bg-simple) !important;border-color:#A5D5FE !important;min-height:80px;font-family:var(--font-global);font-size:0.9em !important;line-height:1.5;padding:10px !important;}}
-/* Panel and Accordion styling (not using accordion here, but good to have) */
.content-panel-simple .gr-panel,.content-panel-simple div[label*="تنظیمات پیشرفته"] > .gr-accordion > .gr-panel {{border-radius:8px !important;border:1px solid var(--fly-border-color) !important;background-color:var(--fly-input-bg-simple) !important;padding:0.8rem 1rem !important;margin-top:0.6rem;box-shadow:none;}}
.content-panel-simple div[label*="تنظیمات پیشرفته"] > .gr-accordion > button.gr-button {{font-weight:500 !important;padding:8px 10px !important;border-radius:6px !important;background-color:#E5E7EB !important;color:var(--fly-text-primary) !important;border:1px solid #D1D5DB !important;}}
-/* Label styling */
.content-panel-simple label > span.label-text {{font-weight:500 !important;color:#4B5563 !important;font-size:0.88em !important;margin-bottom:6px !important;display:inline-block;}}
-/* Slider label styling */
.content-panel-simple .gr-slider label span {{font-size:0.82em !important;color:var(--fly-text-secondary);}}
-/* Temperature description class from AlphaTTS, adapted */
.temp-description-tts {{ font-size: 0.82em !important; color: var(--fly-text-secondary) !important; margin-top: -0.5rem; margin-bottom: 1rem; padding-right: 5px; }}
-/* Examples styling from Alpha Translator */
.content-panel-simple div[label*="نمونه"] {{margin-top:1.5rem;}}
.content-panel-simple div[label*="نمونه"] .gr-button.gr-button-tool,.content-panel-simple div[label*="نمونه"] .gr-sample-button {{background-color:#E0E7FF !important;color:var(--fly-primary) !important;border-radius:6px !important;font-size:0.78em !important;padding:4px 8px !important;}}
.content-panel-simple .custom-hr {{height:1px;background-color:var(--fly-border-color);margin:1.5rem 0;border:none;}}
.api-warning-message {{background-color:#FFFBEB !important;color:#92400E !important;padding:10px 12px !important;border-radius:8px !important;border:1px solid #FDE68A !important;text-align:center !important;margin:0 0.2rem 1rem 0.2rem !important;font-size:0.85em !important;}}
-/* Audio player styling */
.content-panel-simple #output_audio_tts audio {{ width: 100%; border-radius: var(--radius-md); margin-top:0.5rem; }}
@media (min-width:640px) {{.main-content-area {{padding:1.5rem;max-width:700px;}} .content-panel-simple {{padding:1.5rem;}} .app-title-card h1 {{font-size:2.5em !important;}} .app-title-card p {{font-size:1.05em !important;}} }}
@media (min-width:768px) {{
.main-content-area {{max-width:780px;}} .content-panel-simple {{padding:2rem;}}
.content-panel-simple .main-content-row {{display:flex !important;flex-direction:row !important;gap:1.5rem !important;}}
.content-panel-simple .main-content-row > .gr-column:nth-child(1) {{flex-basis:60%; min-width:0;}}
.content-panel-simple .main-content-row > .gr-column:nth-child(2) {{flex-basis:40%; min-width:0;}}
.content-panel-simple .gr-button.lg.primary,.content-panel-simple button[variant="primary"] {{width:auto !important;align-self:flex-start;}}
.app-title-card h1 {{font-size:2.75em !important;}} .app-title-card p {{font-size:1.1em !important;}}
}}
@@ -711,11 +513,10 @@ with gr.Blocks(theme=app_theme_outer, css=custom_css, title="آلفا TTS") as d
        status_message_output = gr.Textbox(label="وضعیت پردازش", interactive=False, lines=1, placeholder="پیامهای وضعیت اینجا نمایش داده میشوند...")

        with gr.Row(elem_classes=["main-content-row"]):
            with gr.Column(scale=3):
                use_file_input_cb = gr.Checkbox(label="📄 استفاده از فایل متنی (.txt)", value=False)
-                # Label for file input is handled by checkbox visibility change
                uploaded_file_input = gr.File(
                    label="آپلود فایل متنی",
                    file_types=['.txt'],
                    visible=False
                )
@@ -728,24 +529,23 @@ with gr.Blocks(theme=app_theme_outer, css=custom_css, title="آلفا TTS") as d
                speech_prompt_tb = gr.Textbox(
                    label="🗣️ سبک و زمینه گفتار (اختیاری)",
                    placeholder="مثال: با لحنی شاد و پرانرژی",
                    value="با لحنی دوستانه و رسا صحبت کن.",
                    lines=2
                )
            with gr.Column(scale=2):
                speaker_voice_dd = gr.Dropdown(
                    SPEAKER_VOICES,
                    label="🎤 انتخاب گوینده",
                    value="Charon"
                )
                temperature_slider = gr.Slider(
                    minimum=0.1, maximum=1.5, step=0.05, value=0.9,
                    label="🌡️ میزان خلاقیت صدا (دما)"
                )
                gr.Markdown("<p class='temp-description-tts'>مقادیر بالاتر = تنوع بیشتر، مقادیر پایینتر = یکنواختی بیشتر.</p>", elem_classes=["temp-description-tts-container"])

        output_audio = gr.Audio(label="🎧 فایل صوتی خروجی", type="filepath", elem_id="output_audio_tts")

-        # Button below the columns
        generate_button = gr.Button("🚀 تولید و پخش صدا", variant="primary", elem_classes=["lg"])

        gr.HTML("<hr class='custom-hr'>")
@@ -757,7 +557,7 @@ with gr.Blocks(theme=app_theme_outer, css=custom_css, title="آلفا TTS") as d
                [False, None, "آیا میتوانم یک پیتزای پپرونی سفارش دهم؟", "پرسشی و مودبانه.", "Achird", 0.75],
            ],
            inputs=[use_file_input_cb, uploaded_file_input, text_to_speak_tb, speech_prompt_tb, speaker_voice_dd, temperature_slider],
            outputs=[output_audio, status_message_output],
            fn=gradio_tts_interface,
            cache_examples=os.getenv("GRADIO_CACHE_EXAMPLES", "False").lower() == "true",
            label="💡 نمونههای کاربردی"
@@ -765,10 +565,9 @@ with gr.Blocks(theme=app_theme_outer, css=custom_css, title="آلفا TTS") as d

    gr.Markdown("<p class='app-footer-fly'>Alpha TTS © 2024</p>")

-    # --- Event Handlers ---
    def toggle_file_input(use_file):
        if use_file:
            return gr.update(visible=True, label=" "), gr.update(visible=False)
        else:
            return gr.update(visible=False), gr.update(visible=True, label="📝 متن فارسی برای تبدیل به گفتار")

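The `.change()` wiring for `toggle_file_input` sits outside the changed hunks; presumably it looks roughly like the sketch below, where the output order matches the two `gr.update` values returned above (this is an assumption, not code from the commit):

```python
# Assumed wiring for the checkbox handler; the actual call is outside these hunks.
use_file_input_cb.change(
    fn=toggle_file_input,
    inputs=[use_file_input_cb],
    outputs=[uploaded_file_input, text_to_speak_tb],  # (file input, text box), matching the returns
)
```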
@@ -782,18 +581,16 @@ with gr.Blocks(theme=app_theme_outer, css=custom_css, title="آلفا TTS") as d
        generate_button.click(
            fn=gradio_tts_interface,
            inputs=[use_file_input_cb, uploaded_file_input, text_to_speak_tb, speech_prompt_tb, speaker_voice_dd, temperature_slider],
            outputs=[output_audio, status_message_output]
        )
    else:
        logging.error("دکمه تولید صدا (generate_button) به درستی مقداردهی اولیه نشده است.")


if __name__ == "__main__":
-
-    if os.getenv("AUTO_RESTART_ENABLED", "true").lower() == "true": # Optional via env var
+    if os.getenv("AUTO_RESTART_ENABLED", "true").lower() == "true":
        restart_scheduler_thread = threading.Thread(target=auto_restart_service, daemon=True)
        restart_scheduler_thread.start()
-    # --- END: شروع ترد ریاستارت خودکار ---

    demo.launch(
        server_name="0.0.0.0",