Update app.py
Browse files
app.py
CHANGED
@@ -8,9 +8,9 @@ import time
|
|
8 |
import zipfile
|
9 |
from google import genai
|
10 |
from google.genai import types
|
|
|
11 |
|
12 |
-
#
|
13 |
-
# The Space's runtime will inject this environment variable if the secret is set.
|
14 |
HF_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
|
15 |
|
16 |
try:
|
@@ -18,10 +18,9 @@ try:
|
|
18 |
PYDUB_AVAILABLE = True
|
19 |
except ImportError:
|
20 |
PYDUB_AVAILABLE = False
|
21 |
-
print("⚠️ pydub
|
22 |
-
print("If merging is desired, ensure pydub is in requirements.txt and ffmpeg is available in the environment.")
|
23 |
|
24 |
-
# ---
|
25 |
SPEAKER_VOICES = [
|
26 |
"Achird", "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
|
27 |
"Laomedeia", "Achernar", "Alnilam", "Schedar", "Gacrux", "Pulcherrima",
|
@@ -30,145 +29,128 @@ SPEAKER_VOICES = [
|
|
30 |
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda"
|
31 |
]
|
32 |
MODELS = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
|
33 |
-
|
34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
35 |
def save_binary_file(file_name, data):
|
36 |
abs_file_name = os.path.abspath(file_name)
|
37 |
try:
|
38 |
with open(abs_file_name, "wb") as f:
|
39 |
f.write(data)
|
40 |
-
print(f"✅
|
41 |
return abs_file_name
|
42 |
except Exception as e:
|
43 |
-
print(f"❌
|
44 |
return None
|
45 |
|
46 |
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
|
47 |
parameters = parse_audio_mime_type(mime_type)
|
48 |
bits_per_sample = parameters["bits_per_sample"]
|
49 |
sample_rate = parameters["rate"]
|
50 |
-
num_channels = 1
|
51 |
data_size = len(audio_data)
|
52 |
bytes_per_sample = bits_per_sample // 8
|
53 |
block_align = num_channels * bytes_per_sample
|
54 |
byte_rate = sample_rate * block_align
|
55 |
-
chunk_size = 36 + data_size
|
56 |
-
|
57 |
header = struct.pack(
|
58 |
"<4sI4s4sIHHIIHH4sI",
|
59 |
-
b"RIFF", chunk_size, b"WAVE", b"fmt ", 16,
|
60 |
-
|
61 |
-
num_channels, sample_rate, byte_rate, block_align, bits_per_sample,
|
62 |
-
b"data", data_size
|
63 |
)
|
64 |
return header + audio_data
|
65 |
|
66 |
def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
|
67 |
-
bits_per_sample = 16
|
68 |
-
rate = 24000
|
69 |
if mime_type:
|
70 |
mime_type_lower = mime_type.lower()
|
71 |
parts = mime_type_lower.split(";")
|
72 |
for param in parts:
|
73 |
param = param.strip()
|
74 |
if param.startswith("rate="):
|
|
|
|
|
|
|
75 |
try:
|
76 |
-
rate_str = param.split("=", 1)[1]
|
77 |
-
rate = int(rate_str)
|
78 |
-
except (ValueError, IndexError): pass
|
79 |
-
elif param.startswith("audio/l"): # e.g., audio/L16 or audio/L24
|
80 |
-
try:
|
81 |
-
# Attempt to parse bits from "L<bits>"
|
82 |
potential_bits = param.split("l", 1)[1]
|
83 |
-
if potential_bits.isdigit():
|
84 |
-
|
85 |
-
except (ValueError, IndexError): pass
|
86 |
return {"bits_per_sample": bits_per_sample, "rate": rate}
|
87 |
|
88 |
def load_text_from_gr_file(file_obj):
|
89 |
if file_obj is None:
|
90 |
-
return "", "
|
91 |
try:
|
92 |
with open(file_obj.name, 'r', encoding='utf-8') as f:
|
93 |
content = f.read().strip()
|
94 |
if not content:
|
95 |
-
return "", "
|
96 |
-
return content, f"
|
97 |
except Exception as e:
|
98 |
-
return "", f"
|
99 |
|
100 |
def smart_text_split(text, max_size=3800):
|
101 |
-
if len(text) <= max_size:
|
102 |
-
|
103 |
-
|
104 |
-
current_chunk = ""
|
105 |
-
sentences = re.split(r'(?<=[.!?])\s+', text) # Split by sentences
|
106 |
for sentence in sentences:
|
107 |
if not sentence: continue
|
108 |
-
|
109 |
-
|
110 |
-
if current_chunk: # If there's something in current_chunk, add it
|
111 |
-
chunks.append(current_chunk.strip())
|
112 |
-
current_chunk = "" # Reset current_chunk
|
113 |
-
|
114 |
-
# If the sentence itself is too long, split it by words or even characters
|
115 |
if len(sentence) > max_size:
|
116 |
-
words = sentence.split(' ')
|
117 |
-
temp_sentence_part = ""
|
118 |
for word in words:
|
119 |
-
if len(
|
120 |
-
if
|
121 |
-
# If word itself is too long (rare for TTS practical limits)
|
122 |
if len(word) > max_size:
|
123 |
-
for i in range(0, len(word), max_size):
|
124 |
-
|
125 |
-
|
126 |
-
|
127 |
-
|
128 |
-
|
129 |
-
|
130 |
-
|
131 |
-
|
132 |
-
else: # Sentence is not too long itself, start a new chunk with it
|
133 |
-
current_chunk = sentence
|
134 |
-
else: # Sentence fits, add to current_chunk
|
135 |
-
current_chunk += (" " if current_chunk else "") + sentence
|
136 |
-
|
137 |
-
if current_chunk: # Add any remaining part
|
138 |
-
chunks.append(current_chunk.strip())
|
139 |
return chunks
|
140 |
|
141 |
-
|
142 |
def merge_audio_files_func(file_paths, output_path):
|
143 |
-
if not PYDUB_AVAILABLE:
|
144 |
-
|
145 |
-
if not file_paths:
|
146 |
-
return False, "No audio files to merge.", None
|
147 |
try:
|
148 |
combined = AudioSegment.empty()
|
149 |
for i, file_path in enumerate(file_paths):
|
150 |
if os.path.exists(file_path):
|
151 |
try:
|
152 |
-
|
153 |
-
# Assuming all inputs are WAV due to our conversion logic
|
154 |
-
audio = AudioSegment.from_file(file_path, format="wav")
|
155 |
combined += audio
|
156 |
-
if i < len(file_paths) - 1:
|
157 |
-
combined += AudioSegment.silent(duration=200) # Small silence
|
158 |
except Exception as e_load:
|
159 |
-
|
160 |
-
|
|
|
161 |
else:
|
162 |
-
|
163 |
-
|
164 |
-
return False,
|
165 |
-
|
166 |
abs_output_path = os.path.abspath(output_path)
|
167 |
combined.export(abs_output_path, format="wav")
|
168 |
-
return True, f"
|
169 |
except Exception as e:
|
170 |
-
|
171 |
-
|
|
|
172 |
|
173 |
def create_zip_file(file_paths, zip_name):
|
174 |
abs_zip_name = os.path.abspath(zip_name)
|
@@ -177,85 +159,75 @@ def create_zip_file(file_paths, zip_name):
|
|
177 |
for file_path in file_paths:
|
178 |
if os.path.exists(file_path):
|
179 |
zipf.write(file_path, os.path.basename(file_path))
|
180 |
-
return True, f"ZIP
|
181 |
except Exception as e:
|
182 |
-
return False, f"
|
183 |
|
184 |
-
# ---
|
185 |
def generate_audio_for_gradio(
|
186 |
-
# api_key_input_field is removed, will use HF_GEMINI_API_KEY
|
187 |
use_file_input_checkbox, text_file_obj,
|
188 |
speech_prompt_input, text_to_speak_input,
|
189 |
max_chunk_slider, sleep_slider, temperature_slider,
|
190 |
-
|
|
|
191 |
merge_checkbox, delete_partials_checkbox,
|
192 |
-
# Progress for Gradio (optional but good for long tasks)
|
193 |
progress=gr.Progress(track_tqdm=True)
|
194 |
):
|
195 |
status_messages = []
|
196 |
-
status_messages.append("🚀
|
197 |
-
progress(0, desc="
|
198 |
|
199 |
-
# 1. API Key Validation (from HF Secrets)
|
200 |
api_key_to_use = HF_GEMINI_API_KEY
|
201 |
if not api_key_to_use:
|
202 |
-
|
203 |
-
|
204 |
-
# For now, let's assume if HF_GEMINI_API_KEY is None, we raise an error.
|
205 |
-
status_messages.append("❌ Error: GEMINI_API_KEY not found in Hugging Face Secrets.")
|
206 |
-
status_messages.append("➡️ Please set it in your Space's Settings > Secrets.")
|
207 |
return None, None, "\n".join(status_messages)
|
208 |
|
209 |
-
os.environ["GEMINI_API_KEY"] = api_key_to_use
|
210 |
-
status_messages.append("🔑 API
|
211 |
|
212 |
-
# 2. Determine Text Input
|
213 |
actual_text_input = ""
|
214 |
if use_file_input_checkbox:
|
215 |
if text_file_obj is None:
|
216 |
-
status_messages.append("❌
|
217 |
return None, None, "\n".join(status_messages)
|
218 |
actual_text_input, msg = load_text_from_gr_file(text_file_obj)
|
219 |
status_messages.append(msg)
|
220 |
-
if not actual_text_input:
|
221 |
-
return None, None, "\n".join(status_messages)
|
222 |
else:
|
223 |
actual_text_input = text_to_speak_input
|
224 |
-
status_messages.append("⌨️
|
225 |
|
226 |
if not actual_text_input or actual_text_input.strip() == "":
|
227 |
-
status_messages.append("❌
|
228 |
return None, None, "\n".join(status_messages)
|
229 |
|
230 |
-
# 3. Initialize GenAI Client
|
231 |
try:
|
232 |
-
status_messages.append("🛠️
|
233 |
-
progress(0.1, desc="
|
234 |
client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
|
235 |
-
status_messages.append("✅
|
236 |
except Exception as e:
|
237 |
-
status_messages.append(f"❌
|
238 |
return None, None, "\n".join(status_messages)
|
239 |
|
240 |
-
# 4. Split text
|
241 |
text_chunks = smart_text_split(actual_text_input, int(max_chunk_slider))
|
242 |
-
status_messages.append(f"📊
|
243 |
-
for i,
|
244 |
-
status_messages.append(f" 📝
|
245 |
|
246 |
-
# 5. Generate audio for each chunk
|
247 |
generated_audio_files = []
|
248 |
run_id = base64.urlsafe_b64encode(os.urandom(6)).decode()
|
249 |
temp_output_dir = f"temp_audio_{run_id}"
|
250 |
os.makedirs(temp_output_dir, exist_ok=True)
|
251 |
-
output_base_name_safe = re.sub(r'[\s\\\/\:\*\?\"\<\>\|\%]+', '_', output_filename_base_input)
|
252 |
|
253 |
total_chunks = len(text_chunks)
|
254 |
for i, chunk_text_content in enumerate(text_chunks):
|
255 |
-
progress_val = 0.1 + (0.7 * (i / total_chunks))
|
256 |
-
progress(progress_val, desc=f"
|
257 |
|
258 |
-
status_messages.append(f"\n🔊
|
259 |
final_text_for_api = f'"{speech_prompt_input}"\n{chunk_text_content}' if speech_prompt_input.strip() else chunk_text_content
|
260 |
|
261 |
contents_for_api = [types.Content(role="user", parts=[types.Part.from_text(text=final_text_for_api)])]
|
@@ -264,17 +236,16 @@ def generate_audio_for_gradio(
|
|
264 |
response_modalities=["audio"],
|
265 |
speech_config=types.SpeechConfig(
|
266 |
voice_config=types.VoiceConfig(
|
267 |
-
prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=speaker_dropdown)
|
268 |
)
|
269 |
)
|
270 |
)
|
271 |
try:
|
272 |
chunk_filename_base = f"{output_base_name_safe}_part_{i+1:03d}"
|
273 |
chunk_filepath_prefix = os.path.join(temp_output_dir, chunk_filename_base)
|
274 |
-
|
275 |
audio_data_received = False
|
276 |
for stream_response_chunk in client.models.generate_content_stream(
|
277 |
-
model=
|
278 |
):
|
279 |
if (stream_response_chunk.candidates and stream_response_chunk.candidates[0].content and
|
280 |
stream_response_chunk.candidates[0].content.parts and
|
@@ -284,161 +255,136 @@ def generate_audio_for_gradio(
|
|
284 |
data_buffer = inline_data.data
|
285 |
api_mime_type = inline_data.mime_type
|
286 |
audio_data_received = True
|
287 |
-
|
288 |
-
status_messages.append(f"ℹ️ API returned MIME type: {api_mime_type}")
|
289 |
|
290 |
-
|
291 |
-
file_extension = ".wav" # Default to .wav and convert
|
292 |
if api_mime_type and ("mp3" in api_mime_type.lower() or "mpeg" in api_mime_type.lower()):
|
293 |
file_extension = ".mp3"
|
294 |
-
|
295 |
-
# pydub will need ffmpeg to read MP3 for merging.
|
296 |
-
status_messages.append(f"ℹ️ Saving as MP3 based on MIME: {api_mime_type}")
|
297 |
elif api_mime_type and "wav" in api_mime_type.lower() and \
|
298 |
not ("audio/l16" in api_mime_type.lower() or "audio/l24" in api_mime_type.lower()):
|
299 |
file_extension = ".wav"
|
300 |
-
|
301 |
-
|
302 |
-
else: # Raw PCM (like audio/L16), unknown, or .bin -> convert to WAV
|
303 |
file_extension = ".wav"
|
304 |
-
status_messages.append(f"ℹ️
|
305 |
data_buffer = convert_to_wav(data_buffer, api_mime_type)
|
306 |
|
307 |
-
status_messages.append(f"ℹ️
|
308 |
|
309 |
generated_file_path = save_binary_file(f"{chunk_filepath_prefix}{file_extension}", data_buffer)
|
310 |
if generated_file_path:
|
311 |
generated_audio_files.append(generated_file_path)
|
312 |
-
status_messages.append(f"✅
|
313 |
else:
|
314 |
-
status_messages.append(f"❌
|
315 |
-
break
|
316 |
-
|
317 |
elif stream_response_chunk.text:
|
318 |
-
status_messages.append(f"ℹ️
|
319 |
|
320 |
if not audio_data_received:
|
321 |
-
status_messages.append(f"❌
|
322 |
-
# Check for errors in the stream response if available
|
323 |
if stream_response_chunk and stream_response_chunk.prompt_feedback and stream_response_chunk.prompt_feedback.block_reason:
|
324 |
-
status_messages.append(f"🛑
|
325 |
-
|
326 |
-
|
327 |
except types.BlockedPromptException as bpe:
|
328 |
-
status_messages.append(f"❌
|
329 |
-
status_messages.append(f"
|
330 |
except types.StopCandidateException as sce:
|
331 |
-
status_messages.append(f"❌
|
332 |
-
status_messages.append(f"
|
333 |
except Exception as e:
|
334 |
-
status_messages.append(f"❌
|
335 |
-
|
336 |
-
|
337 |
-
continue
|
338 |
|
339 |
-
if i < total_chunks - 1:
|
340 |
-
status_messages.append(f"⏱️
|
341 |
time.sleep(float(sleep_slider))
|
342 |
|
343 |
-
progress(0.85, desc="
|
344 |
-
# 6. Handle output files
|
345 |
if not generated_audio_files:
|
346 |
-
status_messages.append("❌
|
347 |
final_status = "\n".join(status_messages)
|
348 |
print(final_status)
|
349 |
-
progress(1, desc="
|
350 |
return None, None, final_status
|
351 |
|
352 |
-
status_messages.append(f"\n🎉 {len(generated_audio_files)}
|
353 |
-
|
354 |
-
output_audio_path_for_player = None
|
355 |
-
output_path_for_download = None
|
356 |
-
|
357 |
-
if merge_checkbox and len(generated_audio_files) > 1:
|
358 |
-
|
359 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
360 |
success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
|
361 |
status_messages.append(msg_zip)
|
362 |
if success_zip: output_path_for_download = zip_p
|
363 |
-
else:
|
364 |
-
status_messages.append(f"🔗 Merging {len(generated_audio_files)} files (all should be WAVs now)...")
|
365 |
-
# Ensure all files for merging are WAV, convert if any MP3s were saved and pydub is used
|
366 |
-
# For simplicity, our save logic now tries to make them WAV if not MP3 from API.
|
367 |
-
# If an MP3 was saved and PYDUB_AVAILABLE, it should handle it.
|
368 |
-
|
369 |
-
merged_filename_path = os.path.join(temp_output_dir, f"{output_base_name_safe}_merged.wav")
|
370 |
-
success_merge, msg_merge, merged_p = merge_audio_files_func(generated_audio_files, merged_filename_path)
|
371 |
-
status_messages.append(msg_merge)
|
372 |
-
if success_merge:
|
373 |
-
output_audio_path_for_player = merged_p
|
374 |
-
output_path_for_download = merged_p
|
375 |
-
if delete_partials_checkbox:
|
376 |
-
status_messages.append("🗑️ Deleting partial files...")
|
377 |
-
for file_p in generated_audio_files:
|
378 |
-
try: os.remove(file_p); status_messages.append(f" 🗑️ Deleted: {os.path.basename(file_p)}")
|
379 |
-
except Exception as e_del: status_messages.append(f" ⚠️ Could not delete {os.path.basename(file_p)}: {e_del}")
|
380 |
-
else:
|
381 |
-
status_messages.append("⚠️ Merge failed. Providing ZIP of parts.")
|
382 |
-
success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
|
383 |
-
status_messages.append(msg_zip)
|
384 |
-
if success_zip: output_path_for_download = zip_p
|
385 |
elif len(generated_audio_files) == 1:
|
386 |
-
# Single file, should be WAV due to our conversion logic or MP3 if API sent that
|
387 |
single_file_path = generated_audio_files[0]
|
388 |
-
|
389 |
-
# Convert MP3 to WAV for Gradio player if it prefers WAV
|
390 |
-
# Or, gr.Audio might handle MP3 directly. Let's test.
|
391 |
-
# For now, assume gr.Audio handles common types.
|
392 |
-
output_audio_path_for_player = single_file_path
|
393 |
-
status_messages.append(f"🎵 Single MP3 file: {os.path.basename(single_file_path)}")
|
394 |
-
else: # Assume WAV
|
395 |
-
output_audio_path_for_player = single_file_path
|
396 |
-
status_messages.append(f"🎵 Single WAV file: {os.path.basename(single_file_path)}")
|
397 |
output_path_for_download = single_file_path
|
398 |
-
|
399 |
-
|
|
|
|
|
|
|
400 |
success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
|
401 |
status_messages.append(msg_zip)
|
402 |
if success_zip: output_path_for_download = zip_p
|
403 |
-
|
404 |
final_status = "\n".join(status_messages)
|
405 |
print(final_status)
|
406 |
-
print(f"DEBUG
|
407 |
-
print(f"DEBUG
|
408 |
-
progress(1, desc="
|
409 |
return output_audio_path_for_player, output_path_for_download, final_status
|
410 |
|
411 |
-
# ---
|
412 |
-
with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
|
413 |
-
gr.Markdown("# 🎵 Gemini
|
|
|
414 |
if not HF_GEMINI_API_KEY:
|
415 |
gr.Warning(
|
416 |
-
"
|
417 |
-
"
|
418 |
-
"Name the secret `GEMINI_API_KEY`."
|
419 |
)
|
420 |
else:
|
421 |
-
gr.Info("
|
422 |
|
423 |
gr.Markdown(
|
424 |
-
"
|
425 |
-
"
|
426 |
-
"\n\
|
427 |
)
|
428 |
|
429 |
with gr.Row():
|
430 |
-
with gr.Column(scale=
|
431 |
-
|
|
|
432 |
text_file = gr.File(
|
433 |
-
label="
|
434 |
file_types=['.txt'],
|
435 |
-
visible=False
|
436 |
)
|
437 |
text_to_speak = gr.Textbox(
|
438 |
-
label="📝
|
439 |
lines=10,
|
440 |
-
placeholder="
|
441 |
-
visible=True
|
|
|
442 |
)
|
443 |
use_file.change(
|
444 |
lambda x: (gr.update(visible=x), gr.update(visible=not x)),
|
@@ -446,105 +392,111 @@ with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary
|
|
446 |
[text_file, text_to_speak]
|
447 |
)
|
448 |
speech_prompt = gr.Textbox(
|
449 |
-
label="🗣️
|
450 |
-
placeholder="
|
451 |
-
info="
|
|
|
452 |
)
|
453 |
|
454 |
-
with gr.Column(scale=
|
455 |
-
|
456 |
-
|
|
|
|
|
|
|
|
|
|
|
457 |
)
|
458 |
-
|
459 |
-
|
|
|
|
|
|
|
|
|
460 |
)
|
461 |
-
|
462 |
-
minimum=0.0, maximum=1.0, step=0.05, value=0.7,
|
463 |
-
label="🌡️ Temperature",
|
464 |
-
info="
|
465 |
)
|
466 |
-
|
467 |
minimum=1000, maximum=4000, step=100, value=3800,
|
468 |
-
label="🧩
|
469 |
-
info="
|
470 |
)
|
471 |
-
|
472 |
-
minimum=
|
473 |
-
label="⏱️
|
474 |
-
info="
|
475 |
)
|
476 |
-
|
477 |
-
label="💾
|
478 |
)
|
479 |
|
480 |
-
with gr.Group(
|
481 |
-
|
482 |
-
|
483 |
-
|
484 |
-
|
485 |
-
|
486 |
-
|
|
|
|
|
487 |
|
488 |
|
489 |
-
submit_button = gr.Button("✨
|
490 |
|
|
|
491 |
with gr.Row():
|
492 |
with gr.Column(scale=1):
|
493 |
-
|
494 |
with gr.Column(scale=1):
|
495 |
-
|
496 |
|
497 |
-
|
498 |
|
499 |
submit_button.click(
|
500 |
fn=generate_audio_for_gradio,
|
501 |
inputs=[
|
502 |
use_file, text_file, speech_prompt, text_to_speak,
|
503 |
-
|
504 |
-
|
505 |
-
|
506 |
],
|
507 |
-
outputs=[
|
508 |
)
|
509 |
|
510 |
gr.Markdown("---")
|
511 |
-
#
|
512 |
-
|
|
|
513 |
try:
|
514 |
-
|
515 |
-
gr.Markdown(f"<p style='text-align:center; font-size:small;'><em>{
|
516 |
-
except Exception
|
517 |
-
print(f"Error decoding/displaying credit: {e_decode}")
|
518 |
-
pass
|
519 |
|
520 |
gr.Examples(
|
|
|
521 |
examples=[
|
522 |
-
[False, None, "
|
523 |
-
[False, None, "
|
524 |
-
[True, "sample_text.txt", "A calm storyteller.", "", 3500, 3, 0.6, MODELS[0], "Vindemiatrix", "example_from_file", True, False]
|
525 |
],
|
526 |
-
fn=generate_audio_for_gradio,
|
527 |
-
inputs=[
|
528 |
use_file, text_file, speech_prompt, text_to_speak,
|
529 |
-
|
530 |
-
|
531 |
-
|
532 |
],
|
533 |
-
outputs=[
|
534 |
-
cache_examples=False #
|
535 |
)
|
536 |
-
gr.Markdown("<small>To use the 'example_from_file', please create a `sample_text.txt` file in the root of this Space with some text content, or upload your own text file.</small>")
|
537 |
-
|
538 |
|
|
|
539 |
if __name__ == "__main__":
|
540 |
if not PYDUB_AVAILABLE:
|
541 |
-
print("
|
542 |
if not HF_GEMINI_API_KEY:
|
543 |
-
print("
|
544 |
|
545 |
-
|
546 |
-
# or set the GEMINI_API_KEY environment variable before running.
|
547 |
-
# e.g., export GEMINI_API_KEY="your_key_here"
|
548 |
-
# then run python app.py
|
549 |
-
|
550 |
-
demo.launch(debug=True, share=False) # share=False for local, HF Spaces handles public link
|
|
|
8 |
import zipfile
|
9 |
from google import genai
|
10 |
from google.genai import types
|
11 |
+
import traceback # برای نمایش خطاهای دقیقتر
|
12 |
|
13 |
+
# Read the API key from Hugging Face Secrets
|
|
|
14 |
HF_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
|
15 |
|
16 |
try:
|
|
|
18 |
PYDUB_AVAILABLE = True
|
19 |
except ImportError:
|
20 |
PYDUB_AVAILABLE = False
|
21 |
+
print("⚠️ کتابخانه pydub در دسترس نیست. قابلیت ادغام فایلهای صوتی غیرفعال خواهد بود.")
|
|
|
22 |
|
23 |
+
# --- Constants ---
|
24 |
SPEAKER_VOICES = [
|
25 |
"Achird", "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
|
26 |
"Laomedeia", "Achernar", "Alnilam", "Schedar", "Gacrux", "Pulcherrima",
|
|
|
29 |
"Zephyr", "Puck", "Charon", "Kore", "Fenrir", "Leda"
|
30 |
]
|
31 |
MODELS = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]
|
32 |
+
MODEL_NAMES_FARSI = {
|
33 |
+
"gemini-2.5-flash-preview-tts": "جمینای ۲.۵ فلش (سریعتر، اقتصادیتر)",
|
34 |
+
"gemini-2.5-pro-preview-tts": "جمینای ۲.۵ پرو (کیفیت بالاتر)"
|
35 |
+
}
|
36 |
+
SPEAKER_VOICES_FARSI_SAMPLE = { # میتوانید برای همه گویندهها نام فارسی تعریف کنید
|
37 |
+
"Charon": "شارون (پیشفرض)",
|
38 |
+
"Achernar": "آخرالنهر",
|
39 |
+
"Vindemiatrix": "vindemiatrix (ستارهشناس)",
|
40 |
+
# ... remaining speakers
|
41 |
+
}
|
42 |
+
|
43 |
+
|
44 |
+
# --- Helper functions (logic largely unchanged; only the messages are now in Persian) ---
|
45 |
def save_binary_file(file_name, data):
    """Write ``data`` to ``file_name`` in binary mode.

    The target path is made absolute before writing. The outcome is reported
    on stdout either way.

    Returns:
        The absolute path of the written file, or ``None`` when the write
        (or the success report) raised an exception.
    """
    destination = os.path.abspath(file_name)
    try:
        with open(destination, "wb") as sink:
            sink.write(data)
        print(f"✅ فایل در مسیر ذخیره شد: {destination}")
    except Exception as exc:
        # Best-effort helper: report the failure and signal it with None
        # instead of propagating.
        print(f"❌ خطا در ذخیره فایل {destination}: {exc}")
        return None
    return destination
|
55 |
|
56 |
def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """Prefix raw audio bytes with a RIFF/WAVE header.

    The sample width and sample rate come from ``parse_audio_mime_type``;
    the channel count is fixed at one (mono).

    Args:
        audio_data: Raw sample bytes to wrap.
        mime_type: MIME type string describing ``audio_data``.

    Returns:
        The packed header followed by ``audio_data`` unchanged.
    """
    fmt_info = parse_audio_mime_type(mime_type)
    sample_bits = fmt_info["bits_per_sample"]
    frames_per_second = fmt_info["rate"]

    channel_count = 1
    payload_len = len(audio_data)
    # Bytes occupied by one sample frame across all channels.
    frame_bytes = channel_count * (sample_bits // 8)

    wav_header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        36 + payload_len,  # RIFF chunk size: header remainder plus payload
        b"WAVE",
        b"fmt ",
        16,
        1,
        channel_count,
        frames_per_second,
        frames_per_second * frame_bytes,  # byte rate
        frame_bytes,  # block align
        sample_bits,
        b"data",
        payload_len,
    )
    return wav_header + audio_data
|
72 |
|
73 |
def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
|
74 |
+
bits_per_sample = 16
|
75 |
+
rate = 24000
|
76 |
if mime_type:
|
77 |
mime_type_lower = mime_type.lower()
|
78 |
parts = mime_type_lower.split(";")
|
79 |
for param in parts:
|
80 |
param = param.strip()
|
81 |
if param.startswith("rate="):
|
82 |
+
try: rate = int(param.split("=", 1)[1])
|
83 |
+
except: pass
|
84 |
+
elif param.startswith("audio/l"):
|
85 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
86 |
potential_bits = param.split("l", 1)[1]
|
87 |
+
if potential_bits.isdigit(): bits_per_sample = int(potential_bits)
|
88 |
+
except: pass
|
|
|
89 |
return {"bits_per_sample": bits_per_sample, "rate": rate}
|
90 |
|
91 |
def load_text_from_gr_file(file_obj):
    """Read the text of an uploaded file and return it with a status message.

    Args:
        file_obj: Either an object exposing the file path as ``.name`` or the
            path itself (string / path-like). ``None`` means nothing was
            uploaded.

    Returns:
        A ``(content, message)`` tuple. ``content`` is the stripped file text,
        or ``""`` when no file was given, the file is empty, or it could not
        be read; ``message`` is the user-facing status line.
    """
    if file_obj is None:
        return "", "فایلی برای ورودی متن ارائه نشده است."

    # Accept both upload wrappers (path in .name) and bare path values.
    # NOTE(review): which of the two Gradio passes depends on its version and
    # the gr.File `type` setting - confirm against the deployed version.
    file_path = getattr(file_obj, "name", file_obj)
    try:
        # utf-8-sig drops a leading BOM, which str.strip() would leave in place.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            content = f.read().strip()
        if not content:
            return "", "فایل متنی خالی است."
        return content, f"متن با موفقیت از فایل '{os.path.basename(file_path)}' ({len(content)} کاراکتر) بارگذاری شد."
    except (OSError, ValueError, TypeError) as e:
        # OSError: missing/unreadable file; ValueError: decoding problems
        # (UnicodeDecodeError is a subclass); TypeError: unusable path value.
        return "", f"خطا در خواندن فایل متنی: {e}"
|
102 |
|
103 |
def smart_text_split(text, max_size=3800):
    """Split ``text`` into chunks of at most ``max_size`` characters.

    Text that already fits is returned unchanged as a single chunk.
    Otherwise sentences (split after ``.``, ``!``, ``?`` or ``؟`` followed by
    whitespace) are packed greedily into chunks. A sentence longer than
    ``max_size`` is packed word by word (split on single spaces), and a word
    longer than ``max_size`` is cut into fixed-size pieces.

    Args:
        text: The text to split.
        max_size: Maximum chunk length in characters; must be at least 1.

    Returns:
        A list of chunks in original order. Whitespace-only fragments are
        never emitted as chunks.

    Raises:
        ValueError: If ``max_size`` is smaller than 1. A non-positive size
            would otherwise fail or silently discard the whole text.
    """
    if max_size < 1:
        raise ValueError(f"max_size must be >= 1, got {max_size}")
    if len(text) <= max_size:
        return [text]

    chunks = []
    current_chunk = ""
    for sentence in re.split(r'(?<=[.!?؟])\s+', text):
        if not sentence:
            continue

        # The +1 accounts for the joining space.
        if len(current_chunk) + len(sentence) + 1 <= max_size:
            current_chunk += (" " if current_chunk else "") + sentence
            continue

        # The sentence does not fit: flush what has been collected so far.
        if current_chunk.strip():
            chunks.append(current_chunk.strip())

        if len(sentence) <= max_size:
            current_chunk = sentence
            continue

        # Oversized sentence: pack it word by word.
        current_chunk = ""
        temp_part = ""
        for word in sentence.split(" "):
            if len(temp_part) + len(word) + 1 <= max_size:
                temp_part += (" " if temp_part else "") + word
                continue

            if temp_part.strip():
                chunks.append(temp_part.strip())

            if len(word) > max_size:
                # Even a single word is too long: cut it into fixed-size pieces.
                for start in range(0, len(word), max_size):
                    piece = word[start:start + max_size]
                    if piece.strip():
                        chunks.append(piece)
                temp_part = ""
            else:
                temp_part = word

        if temp_part.strip():
            chunks.append(temp_part.strip())

    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks
|
127 |
|
|
|
128 |
def merge_audio_files_func(file_paths, output_path):
    """Concatenate audio files into one WAV file using pydub.

    The parts are joined in the given order with 200 ms of silence between
    consecutive parts. Each part is decoded according to its file extension
    (falling back to WAV when there is none), so non-WAV parts such as
    ``.mp3`` files are not misread as WAV.

    Args:
        file_paths: Paths of the audio files to merge, in playback order.
        output_path: Where to write the merged WAV file.

    Returns:
        A ``(success, message, path)`` tuple. ``path`` is the absolute path of
        the merged file on success and ``None`` on any failure (pydub
        unavailable, empty input, missing or unreadable part, export error).
    """
    if not PYDUB_AVAILABLE:
        return False, "pydub در دسترس نیست. امکان ادغام فایل‌ها وجود ندارد.", None
    if not file_paths:
        return False, "هیچ فایل صوتی برای ادغام وجود ندارد.", None

    try:
        combined = AudioSegment.empty()
        last_index = len(file_paths) - 1
        for i, file_path in enumerate(file_paths):
            if not os.path.exists(file_path):
                msg = f"فایل برای ادغام یافت نشد: {os.path.basename(file_path)}"
                print(f"⚠️ {msg}")
                return False, msg, None

            # Decode by extension instead of assuming every part is WAV.
            audio_format = os.path.splitext(file_path)[1].lstrip(".").lower() or "wav"
            try:
                audio = AudioSegment.from_file(file_path, format=audio_format)
                combined += audio
                if i < last_index:
                    # Short pause so consecutive parts do not run together.
                    combined += AudioSegment.silent(duration=200)
            except Exception as e_load:
                msg = f"خطا در بارگذاری فایل صوتی '{os.path.basename(file_path)}' با pydub: {e_load}"
                print(f"⚠️ {msg}")
                return False, msg, None

        abs_output_path = os.path.abspath(output_path)
        combined.export(abs_output_path, format="wav")
        return True, f"فایل ادغام شده با موفقیت در '{os.path.basename(abs_output_path)}' ذخیره شد.", abs_output_path
    except Exception as e:
        # Boundary handler: any pydub/ffmpeg failure is reported, not raised.
        msg = f"خطا در ادغام فایل‌ها: {e}"
        print(f"❌ {msg}")
        return False, msg, None
|
154 |
|
155 |
def create_zip_file(file_paths, zip_name):
|
156 |
abs_zip_name = os.path.abspath(zip_name)
|
|
|
159 |
for file_path in file_paths:
|
160 |
if os.path.exists(file_path):
|
161 |
zipf.write(file_path, os.path.basename(file_path))
|
162 |
+
return True, f"فایل ZIP با نام '{os.path.basename(abs_zip_name)}' ایجاد شد.", abs_zip_name
|
163 |
except Exception as e:
|
164 |
+
return False, f"خطا در ایجاد فایل ZIP: {e}", None
|
165 |
|
166 |
+
# --- Main audio generation function ---
|
167 |
def _extract_inline_audio(stream_response_chunk):
    """Return the inline audio payload of one streamed response, or None."""
    candidates = stream_response_chunk.candidates
    if not candidates:
        return None
    content = candidates[0].content
    if not content or not content.parts:
        return None
    return content.parts[0].inline_data or None


def _zip_generated_parts(audio_files, temp_output_dir, output_base_name_safe, status_messages):
    """Zip all generated parts, log the outcome, and return the ZIP path or None."""
    zip_target = os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip")
    success_zip, msg_zip, zip_p = create_zip_file(audio_files, zip_target)
    status_messages.append(msg_zip)
    return zip_p if success_zip else None


def generate_audio_for_gradio(
    use_file_input_checkbox, text_file_obj,
    speech_prompt_input, text_to_speak_input,
    max_chunk_slider, sleep_slider, temperature_slider,
    model_dropdown_key,
    speaker_dropdown, output_filename_base_input,
    merge_checkbox, delete_partials_checkbox,
    progress=gr.Progress(track_tqdm=True)
):
    """Convert text to speech with the Gemini API; this is the Gradio click handler.

    The input text (typed, or loaded from an uploaded file) is split into
    chunks, each chunk is synthesised with the selected model and voice, and
    the resulting parts are merged, zipped, or returned as a single file.

    Args:
        use_file_input_checkbox: When true, read the text from ``text_file_obj``.
        text_file_obj: Uploaded text file object (may be ``None``).
        speech_prompt_input: Optional style instruction prepended to every chunk.
        text_to_speak_input: Text typed by the user.
        max_chunk_slider: Maximum number of characters per chunk.
        sleep_slider: Seconds to wait between consecutive API requests.
        temperature_slider: Sampling temperature passed to the model.
        model_dropdown_key: Model identifier sent to the API.
        speaker_dropdown: Prebuilt voice name sent to the API.
        output_filename_base_input: Base name used for the generated files.
        merge_checkbox: Merge all parts into one file when pydub is available.
        delete_partials_checkbox: Delete the parts after a successful merge.
        progress: Gradio progress tracker.

    Returns:
        ``(player_path, download_path, status_text)``. Either path may be
        ``None``; ``status_text`` is the newline-joined log of the run.
    """
    status_messages = ["🚀 فرآیند تبدیل متن به گفتار آغاز شد..."]
    progress(0, desc="در حال آمادهسازی...")

    api_key_to_use = HF_GEMINI_API_KEY
    if not api_key_to_use:
        status_messages.append("❌ خطا: کلید API جمینای (GEMINI_API_KEY) در تنظیمات Secret این Space یافت نشد.")
        status_messages.append("⬅️ لطفاً آن را در بخش Settings > Secrets مربوط به این Space تنظیم کنید.")
        return None, None, "\n".join(status_messages)
    status_messages.append("🔑 کلید API با موفقیت از Secrets بارگذاری شد.")

    # --- Resolve the input text -------------------------------------------
    if use_file_input_checkbox:
        if text_file_obj is None:
            status_messages.append("❌ خطا: گزینه 'استفاده از فایل متنی' انتخاب شده، اما هیچ فایلی آپلود نشده است.")
            return None, None, "\n".join(status_messages)
        actual_text_input, msg = load_text_from_gr_file(text_file_obj)
        status_messages.append(msg)
        if not actual_text_input:
            return None, None, "\n".join(status_messages)
    else:
        actual_text_input = text_to_speak_input
        status_messages.append("⌨️ از متن وارد شده به صورت دستی استفاده میشود.")

    if not actual_text_input or actual_text_input.strip() == "":
        status_messages.append("❌ خطا: متن ورودی خالی است.")
        return None, None, "\n".join(status_messages)

    # --- Create the API client --------------------------------------------
    try:
        status_messages.append("🛠️ در حال مقداردهی اولیه کلاینت جمینای...")
        progress(0.1, desc="اتصال به جمینای...")
        # The key is handed to the client directly; writing it back into
        # os.environ first (as before) was a no-op round trip.
        client = genai.Client(api_key=api_key_to_use)
        status_messages.append("✅ کلاینت جمینای با موفقیت ایجاد شد.")
    except Exception as e:
        status_messages.append(f"❌ خطا در ایجاد کلاینت جمینای: {e}")
        return None, None, "\n".join(status_messages)

    # --- Split the text ----------------------------------------------------
    text_chunks = smart_text_split(actual_text_input, int(max_chunk_slider))
    status_messages.append(f"📊 متن به {len(text_chunks)} قطعه تقسیم شد.")
    for i, chunk_text_content in enumerate(text_chunks):
        status_messages.append(f" 📝 قطعه {i+1}: {len(chunk_text_content)} کاراکتر")

    generated_audio_files = []
    run_id = base64.urlsafe_b64encode(os.urandom(6)).decode()
    temp_output_dir = f"temp_audio_{run_id}"
    os.makedirs(temp_output_dir, exist_ok=True)
    # Replace whitespace and characters that are unsafe in file names; fall
    # back to a fixed name so an empty field does not yield "_part_001.wav".
    output_base_name_safe = re.sub(
        r'[\s\\\/\:\*\?\"\<\>\|\%]+', '_', output_filename_base_input or ""
    ) or "gemini_tts"

    style_prompt = (speech_prompt_input or "").strip()
    request_delay = float(sleep_slider)
    total_chunks = len(text_chunks)

    # --- Synthesize every chunk -------------------------------------------
    for i, chunk_text_content in enumerate(text_chunks):
        progress_val = 0.1 + (0.7 * (i / total_chunks))
        progress(progress_val, desc=f"در حال تولید قطعه {i+1} از {total_chunks}...")

        status_messages.append(f"\n🔊 در حال تولید صدا برای قطعه {i+1}/{total_chunks}...")
        final_text_for_api = (
            f'"{speech_prompt_input}"\n{chunk_text_content}' if style_prompt else chunk_text_content
        )

        contents_for_api = [types.Content(role="user", parts=[types.Part.from_text(text=final_text_for_api)])]
        generate_content_config = types.GenerateContentConfig(
            temperature=float(temperature_slider),
            response_modalities=["audio"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=speaker_dropdown)
                )
            )
        )
        try:
            chunk_filename_base = f"{output_base_name_safe}_part_{i+1:03d}"
            chunk_filepath_prefix = os.path.join(temp_output_dir, chunk_filename_base)

            # A stream may deliver the audio in several pieces. Collect them
            # all and write ONE file per text chunk; saving each piece under
            # the same name would overwrite earlier audio and register the
            # same path repeatedly for merging.
            audio_pieces = []
            api_mime_type = None
            last_stream_response = None
            for stream_response_chunk in client.models.generate_content_stream(
                model=model_dropdown_key, contents=contents_for_api, config=generate_content_config,
            ):
                last_stream_response = stream_response_chunk
                inline_data = _extract_inline_audio(stream_response_chunk)
                if inline_data is not None:
                    if inline_data.data:
                        audio_pieces.append(inline_data.data)
                    if api_mime_type is None:
                        api_mime_type = inline_data.mime_type
                elif stream_response_chunk.text:
                    status_messages.append(f"ℹ️ پیام متنی از API (حین استریم): {stream_response_chunk.text}")

            if audio_pieces:
                data_buffer = b"".join(audio_pieces)
                status_messages.append(f"ℹ️ MIME Type دریافتی از API: {api_mime_type}")

                mime_lower = (api_mime_type or "").lower()
                file_extension = ".wav"
                if "mp3" in mime_lower or "mpeg" in mime_lower:
                    file_extension = ".mp3"
                    status_messages.append(f"ℹ️ ذخیره با فرمت MP3 بر اساس MIME Type: {api_mime_type}")
                elif "wav" in mime_lower and not ("audio/l16" in mime_lower or "audio/l24" in mime_lower):
                    # Already a WAV container: store the bytes unchanged.
                    # NOTE(review): if the API ever streams several WAV
                    # containers for one request, joining them byte-wise is
                    # not a valid WAV — confirm against real responses.
                    status_messages.append(f"ℹ️ ذخیره با فرمت WAV بر اساس MIME Type: {api_mime_type}")
                else:
                    # Raw PCM (or unknown): wrap the samples in a WAV header.
                    status_messages.append(f"ℹ️ تبدیل به فرمت WAV برای MIME Type: {api_mime_type or 'نامشخص'}")
                    data_buffer = convert_to_wav(data_buffer, api_mime_type)

                status_messages.append(f"ℹ️ پسوند فایل نهایی: {file_extension}")

                generated_file_path = save_binary_file(f"{chunk_filepath_prefix}{file_extension}", data_buffer)
                if generated_file_path:
                    generated_audio_files.append(generated_file_path)
                    status_messages.append(f"✅ قطعه {i+1} ذخیره شد: {os.path.basename(generated_file_path)}")
                else:
                    status_messages.append(f"❌ عدم موفقیت در ذخیره قطعه {i+1}.")
            else:
                status_messages.append(f"❌ هیچ داده صوتی برای قطعه {i+1} دریافت نشد.")
                # last_stream_response stays None when the stream was empty.
                feedback = getattr(last_stream_response, "prompt_feedback", None)
                if feedback and feedback.block_reason:
                    status_messages.append(
                        f"🛑 دلیل مسدود شدن توسط API: {feedback.block_reason_message or feedback.block_reason}"
                    )
        except Exception as e:
            # One broad handler on purpose: the previous clauses named
            # types.BlockedPromptException / types.StopCandidateException,
            # which google.genai.types does not provide, so evaluating them
            # raised AttributeError while handling ANY error and aborted the
            # whole run. Blocked prompts are reported via prompt_feedback above.
            status_messages.append(f"❌ خطا در تولید/پردازش قطعه {i+1}: {type(e).__name__} - {e}")
            status_messages.append(traceback.format_exc())

        # Pause after failures too: errors are frequently rate-limit
        # responses, which is exactly when the delay matters.
        if i < total_chunks - 1 and request_delay > 0:
            status_messages.append(f"⏱️ انتظار به مدت {sleep_slider} ثانیه...")
            time.sleep(request_delay)

    # --- Assemble the outputs ---------------------------------------------
    progress(0.85, desc="پردازش فایلهای نهایی...")
    if not generated_audio_files:
        status_messages.append("❌ هیچ فایل صوتی با موفقیت تولید یا ذخیره نشد!")
        final_status = "\n".join(status_messages)
        print(final_status)
        progress(1, desc="پایان با خطا.")
        return None, None, final_status

    status_messages.append(f"\n🎉 {len(generated_audio_files)} فایل(های) صوتی تولید شد!")

    output_audio_path_for_player = None
    output_path_for_download = None
    has_multiple_parts = len(generated_audio_files) > 1

    if merge_checkbox and has_multiple_parts and PYDUB_AVAILABLE:
        status_messages.append(f"🔗 در حال ادغام {len(generated_audio_files)} فایل صوتی...")
        merged_filename_path = os.path.join(temp_output_dir, f"{output_base_name_safe}_merged.wav")
        success_merge, msg_merge, merged_p = merge_audio_files_func(generated_audio_files, merged_filename_path)
        status_messages.append(msg_merge)
        if success_merge:
            output_audio_path_for_player = merged_p
            output_path_for_download = merged_p
            if delete_partials_checkbox:
                status_messages.append("🗑️ در حال حذف فایلهای جزئی...")
                for file_p in generated_audio_files:
                    try:
                        os.remove(file_p)
                        status_messages.append(f" 🗑️ حذف شد: {os.path.basename(file_p)}")
                    except Exception as e_del:
                        status_messages.append(f" ⚠️ عدم موفقیت در حذف {os.path.basename(file_p)}: {e_del}")
        else:
            status_messages.append("⚠️ ادغام ناموفق بود. فایل ZIP از قطعات ارائه میشود.")
            output_path_for_download = _zip_generated_parts(
                generated_audio_files, temp_output_dir, output_base_name_safe, status_messages
            )
    elif not has_multiple_parts:
        single_file_path = generated_audio_files[0]
        output_audio_path_for_player = single_file_path
        output_path_for_download = single_file_path
        status_messages.append(f"🎵 فایل صوتی تکی: {os.path.basename(single_file_path)}")
    else:
        # Several parts but no merge (not requested, or pydub is missing).
        if not PYDUB_AVAILABLE and merge_checkbox:
            status_messages.append("⚠️ pydub در دسترس نیست، امکان ادغام وجود ندارد. فایل ZIP ارائه میشود.")
        status_messages.append("📦 چندین قطعه تولید شد. در حال ایجاد فایل ZIP...")
        output_path_for_download = _zip_generated_parts(
            generated_audio_files, temp_output_dir, output_base_name_safe, status_messages
        )

    final_status = "\n".join(status_messages)
    print(final_status)
    print(f"DEBUG مسیر فایل برای پخش کننده: {output_audio_path_for_player}")
    print(f"DEBUG مسیر فایل برای دانلود: {output_path_for_download}")
    progress(1, desc="انجام شد!")
    return output_audio_path_for_player, output_path_for_download, final_status
|
354 |
|
355 |
+
# --- تعریف رابط کاربری Gradio ---
|
356 |
+
# Gradio UI definition. The components created here are wired to
# generate_audio_for_gradio through the submit button and the examples.
with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky), title="تبدیل متن به گفتار با Gemini") as demo:
    gr.Markdown("# 🎵 تبدیل متن به گفتار با Gemini API 🗣️", elem_id="app-title")

    if not HF_GEMINI_API_KEY:
        # gr.Warning / gr.Info only reach the browser when called from inside
        # an event handler; while the layout is being built they are never
        # shown. A Markdown banner makes the missing-key notice visible.
        gr.Markdown(
            "⚠️ کلید API جمینای (GEMINI_API_KEY) در Hugging Face Secrets یافت نشد. "
            "لطفاً آن را در بخش 'Settings' > 'Secrets' این Space با نام `GEMINI_API_KEY` اضافه کنید تا برنامه کار کند."
        )
    else:
        # Purely informational, so it goes to the server log.
        print("کلید API جمینای با موفقیت از Secrets بارگذاری شد. آماده تولید صدا!")

    gr.Markdown(
        "این ابزار متن شما را با استفاده از API قدرتمند Gemini گوگل به گفتار تبدیل میکند. "
        "برای استفاده، باید کلید API جمینای خود را در بخش Secrets این Space تنظیم کرده باشید."
        "\n\nمیتوانید کلید API خود را از [استودیوی هوش مصنوعی گوگل (Google AI Studio)](https://aistudio.google.com/app/apikey) دریافت کنید."
    )

    with gr.Row():
        with gr.Column(scale=3):  # text input column
            gr.Markdown("### ۱. متن ورودی")
            use_file = gr.Checkbox(label="📁 استفاده از فایل متنی (.txt) به جای ورود دستی", value=False)
            text_file = gr.File(
                label="آپلود فایل متنی",
                file_types=['.txt'],
                visible=False
            )
            text_to_speak = gr.Textbox(
                label="📝 متنی که میخواهید به گفتار تبدیل شود:",
                lines=10,
                placeholder="متن خود را در اینجا وارد کنید یا فایل متنی را در بالا آپلود نمایید...",
                visible=True,
                text_align="right"  # right-aligned for Persian text
            )
            # Show exactly one of the two text sources at a time.
            use_file.change(
                lambda x: (gr.update(visible=x), gr.update(visible=not x)),
                use_file,
                [text_file, text_to_speak]
            )
            speech_prompt = gr.Textbox(
                label="🗣️ فرمان سبک گفتار (اختیاری)",
                placeholder="مثال: «با لحنی دوستانه و پرانرژی، مانند یک مجری پادکست صحبت کن»",
                info="این فرمان به تنظیم سبک، احساسات و ویژگیهای صدای خروجی کمک میکند.",
                text_align="right"
            )

        with gr.Column(scale=2):  # settings column
            gr.Markdown("### ۲. تنظیمات تولید صدا")
            # (label, value) pairs: the Persian label is displayed, the
            # English key is what the handler receives.
            model_choices_farsi = [(MODEL_NAMES_FARSI[key], key) for key in MODELS]
            model_name_dropdown = gr.Dropdown(
                choices=model_choices_farsi,
                label="🤖 انتخاب مدل Gemini",
                value=MODELS[0]
            )

            speaker_choices_farsi = [(SPEAKER_VOICES_FARSI_SAMPLE.get(v, v), v) for v in SPEAKER_VOICES]
            speaker_voice_dropdown = gr.Dropdown(
                choices=speaker_choices_farsi,
                label="🎤 انتخاب گوینده",
                value="Charon"
            )
            temperature_slider = gr.Slider(
                minimum=0.0, maximum=1.0, step=0.05, value=0.7,
                label="🌡️ دمای مدل (Temperature)",
                info="میزان خلاقیت و تنوع در خروجی (0.0 تا 1.0). مقادیر بالاتر = تنوع بیشتر."
            )
            max_chunk_size_slider = gr.Slider(
                minimum=1000, maximum=4000, step=100, value=3800,
                label="🧩 حداکثر کاراکتر در هر قطعه",
                info="متن برای ارسال به API به قطعات کوچکتر تقسیم میشود."
            )
            sleep_between_requests_slider = gr.Slider(
                minimum=0, maximum=15, step=0.5, value=1,
                label="⏱️ تاخیر بین درخواستها (ثانیه)",
                info="برای مدیریت محدودیتهای API (مثلاً Gemini Flash دارای محدودیت ۶۰ درخواست در دقیقه است)."
            )
            output_filename_base_input = gr.Textbox(
                label="💾 نام پایه فایل خروجی", value="gemini_tts_farsi"
            )

            with gr.Group(elem_id="merge-options"):
                gr.Markdown("تنظیمات ادغام (در صورت تولید بیش از یک قطعه):")
                merge_audio_checkbox = gr.Checkbox(label="🔗 ادغام قطعات صوتی", value=True, visible=PYDUB_AVAILABLE)
                delete_partials_checkbox = gr.Checkbox(label="🗑️ حذف قطعات پس از ادغام", value=True, visible=PYDUB_AVAILABLE)

            if PYDUB_AVAILABLE:
                # Deleting partial files is only meaningful while merging is on.
                merge_audio_checkbox.change(lambda x: gr.update(visible=x), [merge_audio_checkbox], [delete_partials_checkbox])
            else:
                gr.Markdown("<p style='color:orange; font-size:small;'>⚠️ قابلیت ادغام فایلها به دلیل عدم دسترسی به کتابخانه `pydub` غیرفعال است.</p>")

    submit_button = gr.Button("✨ تولید فایل صوتی ✨", variant="primary", elem_id="submit-button-main")

    gr.Markdown("### ۳. خروجی")
    with gr.Row():
        with gr.Column(scale=1):
            output_audio_player_component = gr.Audio(label="🎧 فایل صوتی تولید شده (قابل پخش)", type="filepath")
        with gr.Column(scale=1):
            output_file_download_component = gr.File(label="📥 دانلود فایل خروجی (صوتی یا ZIP)", type="filepath")

    status_textbox_component = gr.Textbox(label="📊 گزارش وضعیت و پیامها", lines=10, interactive=False, max_lines=20, text_align="right")

    submit_button.click(
        fn=generate_audio_for_gradio,
        inputs=[
            use_file, text_file, speech_prompt, text_to_speak,
            max_chunk_size_slider, sleep_between_requests_slider, temperature_slider,
            model_name_dropdown, speaker_voice_dropdown, output_filename_base_input,
            merge_audio_checkbox, delete_partials_checkbox
        ],
        outputs=[output_audio_player_component, output_file_download_component, status_textbox_component]
    )

    gr.Markdown("---")
    # Creator credit, stored base64-encoded. Decodes to:
    # "Created by : Hamed744 (AIGOLDEN) for Hugging Face Spaces."
    encoded_text_creator = "Q3JlYXRlZCBieSA6IEhhbWVkNzQ0IChBSUdPTERFTikgZm9yIEh1Z2dpbmcgRmFjZSBTcGFjZXMu"
    try:
        decoded_text_creator = base64.b64decode(encoded_text_creator.encode('utf-8')).decode('utf-8')
    except ValueError:
        # binascii.Error and UnicodeDecodeError are both ValueError
        # subclasses; the credit line is optional, so it is simply omitted.
        decoded_text_creator = None
    if decoded_text_creator:
        gr.Markdown(f"<p style='text-align:center; font-size:small; color:grey;'><em>{decoded_text_creator}</em></p>")

    gr.Examples(
        label="چند مثال برای شروع:",
        examples=[
            [False, None, "یک راوی مهربان و آموزنده.", "سلام دنیا! این یک آزمایش برای تبدیل متن به گفتار با جمینای در گریدیا است. امیدوارم خوب کار کند!", 3800, 1, 0.7, MODELS[0], "Charon", "hello_farsi", True, True],
            [False, None, "یک گزارشگر خبری هیجانزده.", "خبر فوری! هوش مصنوعی اکنون میتواند گفتاری شبیه به انسان با وضوح باورنکردنی تولید کند. این فناوری به سرعت در حال پیشرفت است!", 3000, 1, 0.8, MODELS[1], "Achernar", "news_farsi", True, True],
        ],
        fn=generate_audio_for_gradio,
        inputs=[
            use_file, text_file, speech_prompt, text_to_speak,
            max_chunk_size_slider, sleep_between_requests_slider, temperature_slider,
            model_name_dropdown, speaker_voice_dropdown, output_filename_base_input,
            merge_audio_checkbox, delete_partials_checkbox
        ],
        outputs=[output_audio_player_component, output_file_download_component, status_textbox_component],
        cache_examples=False  # results come from a live API call, so nothing is cached
    )
|
|
|
|
|
494 |
|
495 |
+
# اجرای برنامه در صورت اجرای مستقیم فایل (برای تست محلی)
|
496 |
if __name__ == "__main__":
    # Print a console warning for every optional prerequisite that is
    # missing, then start the Gradio server.
    startup_checks = (
        (PYDUB_AVAILABLE, "هشدار: کتابخانه pydub نصب نشده یا کار نمیکند. قابلیت ادغام فایلهای صوتی غیرفعال خواهد بود."),
        (HF_GEMINI_API_KEY, "هشدار: متغیر محیطی GEMINI_API_KEY تنظیم نشده است. اگر برنامه برای کلید API به آن متکی باشد، ممکن است در حالت محلی کار نکند."),
    )
    for prerequisite_ok, warning_text in startup_checks:
        if not prerequisite_ok:
            print(warning_text)

    # share=False: no public tunnel is requested from Gradio.
    demo.launch(debug=True, share=False)
|
|
|
|
|
|
|
|
|
|