Update app.py
app.py CHANGED
@@ -9,14 +9,19 @@ import zipfile
-    print("⚠️ pydub is not available. Audio
-# --- Constants
@@ -26,51 +31,61 @@ SPEAKER_VOICES = [
-# --- Helper functions
-    # Ensure we are writing to a path Gradio can access (usually current dir is fine for temp files)
-    num_channels = 1
-    chunk_size = 36 + data_size
-        b"RIFF", chunk_size, b"WAVE", b"fmt ", 16,
-    bits_per_sample = 16
-    rate = 24000
-    """Load text from a Gradio file object."""
@@ -78,7 +93,6 @@ def load_text_from_gr_file(file_obj):
-        print(f"📄 Text loaded from file: {len(content)} characters")
@@ -88,25 +102,39 @@ def smart_text_split(text, max_size=3800):
-    sentences = re.split(r'(?<=[.!?])\s+', text)
-        if
-        else:
@@ -114,24 +142,32 @@ def smart_text_split(text, max_size=3800):
-        print(f"🔄 Merging {len(file_paths)} audio files...")
-                combined +=
-                print(f"⚠️ File not found: {file_path}")
-        print(f"✅ Merged file saved: {abs_output_path}")
@@ -141,30 +177,37 @@ def create_zip_file(file_paths, zip_name):
-        print(f"📦 ZIP file created: {abs_zip_name}")
-# --- Main generation function (modified for Gradio) ---
-    use_file_input_checkbox, text_file_obj,
-    merge_checkbox, delete_partials_checkbox
-    print("🚀 Starting Text-to-Speech process...")
@@ -187,6 +230,7 @@ def generate_audio_for_gradio(
@@ -196,22 +240,25 @@
-    for i,
-        status_messages.append(f"   📄 Chunk {i+1}: {len(
-    # Create a unique temp directory for this run to avoid conflicts
-    output_base_name_safe = re.sub(r'\W+', '_', output_filename_base_input) # Sanitize filename
@@ -223,244 +270,281 @@
-            # Save chunks in the temporary directory
-                if (
-                    inline_data =
-            status_messages.append(f"❌ Error generating chunk {i+1}: {e}")
-        if i <
-        status_messages.append("❌ No audio files were generated!")
-            status_messages.append(
-            if
-                output_download_path = zip_path
-                # No single audio player if zipped
-            status_messages.append(
-            if
-                        try:
-                        except Exception as e_del:
-                            status_messages.append(f"   ⚠️ Could not delete {os.path.basename(file_p)}: {e_del}")
-            else: # Merge failed, provide ZIP
-                if success_zip:
-                    output_download_path = zip_p
-            status_messages.append(
-            if
-                output_download_path = zip_path
-                # No single audio player if zipped
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-        "
-        with gr.Column(scale=
-                label="🔑 Gemini API Key",
-                type="password",
-                placeholder="Enter your Gemini API Key here"
-            )
-            use_file = gr.Checkbox(label="📁 Use Text File Input", value=False)
-                label="Upload Text File
-                label="📝 Text to Speak",
-                lines=
-                placeholder="Enter text here
-            # Dynamic visibility for text input vs file input
-            output_filename_base = gr.Textbox(
-                label="💾 Output Filename Base",
-                value="gemini_tts_output",
-                info="Base name for generated files (no extension)."
-            )
-                MODELS,
-                label="🤖 Model",
-                value=MODELS[0]
-                SPEAKER_VOICES,
-                label="🎤 Speaker Voice",
-                value="Charon"
-                minimum=0.0, maximum=
-                info="Controls randomness
-                minimum=
-                info="Text is split
-                minimum=
-                label="⏱️ Sleep Between
-                info="Helps manage API rate limits
-    submit_button = gr.Button("✨ Generate Audio ✨", variant="primary")
-            output_audio_player = gr.Audio(label="🎧 Generated Audio Output", type="filepath")
-    status_textbox = gr.Textbox(label="📋 Status Log", lines=10, interactive=False)
-    # Connect button to the function
-            merge_audio, delete_partials
-    gr.Markdown(f"Created by aigolden - pydub available: {PYDUB_AVAILABLE}")
-    encoded_text = "Q3JlYXRlIGJ5IDogYWlnb2xkZW4="
-        decoded_text = base64.b64decode(encoded_text.encode()).decode()
-        gr.Markdown(f"<
-    except:
-    # Example Usage (if needed, for testing locally)
-        [
-        ],
-        [
-            "YOUR_API_KEY_HERE", False, None, "An excited news anchor.", "Breaking news! Artificial intelligence can now generate human-like speech with incredible clarity. This opens up a world of possibilities for content creation and accessibility.", 3000, 12, 0.9, MODELS[1], "Achernar", "example_news", True, True
-        ]
-        cache_examples=False # Set to True if inputs are static and fn is pure
@@ -9,14 +9,19 @@ import zipfile
from google import genai
from google.genai import types

+# Attempt to load API key from Hugging Face Secrets
+# The Space's runtime will inject this environment variable if the secret is set.
+HF_GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")
+
try:
    from pydub import AudioSegment
    PYDUB_AVAILABLE = True
except ImportError:
    PYDUB_AVAILABLE = False
+    print("⚠️ pydub is not available. Audio file merging will be disabled.")
+    print("If merging is desired, ensure pydub is in requirements.txt and ffmpeg is available in the environment.")

+# --- Constants ---
SPEAKER_VOICES = [
    "Achird", "Zubenelgenubi", "Vindemiatrix", "Sadachbia", "Sadaltager", "Sulafat",
    "Laomedeia", "Achernar", "Alnilam", "Schedar", "Gacrux", "Pulcherrima",
@@ -26,51 +31,61 @@ SPEAKER_VOICES = [
]
MODELS = ["gemini-2.5-flash-preview-tts", "gemini-2.5-pro-preview-tts"]

+# --- Helper functions ---
def save_binary_file(file_name, data):
    abs_file_name = os.path.abspath(file_name)
+    try:
+        with open(abs_file_name, "wb") as f:
+            f.write(data)
+        print(f"✅ File saved at: {abs_file_name}")
+        return abs_file_name
+    except Exception as e:
+        print(f"❌ Error saving file {abs_file_name}: {e}")
+        return None

def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    parameters = parse_audio_mime_type(mime_type)
    bits_per_sample = parameters["bits_per_sample"]
    sample_rate = parameters["rate"]
+    num_channels = 1 # Gemini TTS seems to output mono
    data_size = len(audio_data)
    bytes_per_sample = bits_per_sample // 8
    block_align = num_channels * bytes_per_sample
    byte_rate = sample_rate * block_align
+    chunk_size = 36 + data_size # Size of the 'fmt ' and 'data' chunks and their headers
+
    header = struct.pack(
        "<4sI4s4sIHHIIHH4sI",
+        b"RIFF", chunk_size, b"WAVE", b"fmt ", 16, # 16 for PCM
+        1, # PCM format
+        num_channels, sample_rate, byte_rate, block_align, bits_per_sample,
+        b"data", data_size
    )
    return header + audio_data

def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
+    bits_per_sample = 16 # Default
+    rate = 24000 # Default for Gemini TTS
+    if mime_type:
+        mime_type_lower = mime_type.lower()
+        parts = mime_type_lower.split(";")
+        for param in parts:
+            param = param.strip()
+            if param.startswith("rate="):
+                try:
+                    rate_str = param.split("=", 1)[1]
+                    rate = int(rate_str)
+                except (ValueError, IndexError): pass
+            elif param.startswith("audio/l"): # e.g., audio/L16 or audio/L24
+                try:
+                    # Attempt to parse bits from "L<bits>"
+                    potential_bits = param.split("l", 1)[1]
+                    if potential_bits.isdigit():
+                        bits_per_sample = int(potential_bits)
+                except (ValueError, IndexError): pass
    return {"bits_per_sample": bits_per_sample, "rate": rate}

def load_text_from_gr_file(file_obj):
    if file_obj is None:
        return "", "No file provided for text input."
    try:
@@ -78,7 +93,6 @@ def load_text_from_gr_file(file_obj):
            content = f.read().strip()
        if not content:
            return "", "Text file is empty."
        return content, f"Successfully loaded {len(content)} chars from {os.path.basename(file_obj.name)}."
    except Exception as e:
        return "", f"Error reading text file: {e}"
@@ -88,25 +102,39 @@ def smart_text_split(text, max_size=3800):
        return [text]
    chunks = []
    current_chunk = ""
+    sentences = re.split(r'(?<=[.!?])\s+', text) # Split by sentences
    for sentence in sentences:
+        if not sentence: continue
+        # If adding the current sentence exceeds max_size
+        if len(current_chunk) + len(sentence) + 1 > max_size: # +1 for space
+            if current_chunk: # If there's something in current_chunk, add it
                chunks.append(current_chunk.strip())
+                current_chunk = "" # Reset current_chunk
+
+            # If the sentence itself is too long, split it by words or even characters
            if len(sentence) > max_size:
+                words = sentence.split(' ')
+                temp_sentence_part = ""
+                for word in words:
+                    if len(temp_sentence_part) + len(word) + 1 > max_size:
+                        if temp_sentence_part: chunks.append(temp_sentence_part.strip())
+                        # If word itself is too long (rare for TTS practical limits)
+                        if len(word) > max_size:
+                            for i in range(0, len(word), max_size):
+                                chunks.append(word[i:i+max_size])
+                            temp_sentence_part = ""
+                        else:
+                            temp_sentence_part = word
+                    else:
+                        temp_sentence_part += (" " if temp_sentence_part else "") + word
+                if temp_sentence_part: chunks.append(temp_sentence_part.strip())
+                # current_chunk remains empty as the long sentence was fully processed
+            else: # Sentence is not too long itself, start a new chunk with it
                current_chunk = sentence
+        else: # Sentence fits, add to current_chunk
            current_chunk += (" " if current_chunk else "") + sentence
+
+    if current_chunk: # Add any remaining part
        chunks.append(current_chunk.strip())
    return chunks
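smart_text_split() keeps whole sentences together and only falls back to word- or character-level splitting when a single sentence exceeds the limit. A quick illustrative check, assuming the function is in scope:

# Illustrative only: verify that no chunk exceeds the requested size.
sample_text = "First sentence here. Second one follows! Does a third fit? " * 200
chunks = smart_text_split(sample_text, max_size=3800)
print(f"{len(chunks)} chunks, sizes: {[len(c) for c in chunks]}")
assert all(len(c) <= 3800 for c in chunks)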

@@ -114,24 +142,32 @@ def smart_text_split(text, max_size=3800):
def merge_audio_files_func(file_paths, output_path):
    if not PYDUB_AVAILABLE:
        return False, "pydub is not available. Cannot merge files.", None
+    if not file_paths:
+        return False, "No audio files to merge.", None
    try:
        combined = AudioSegment.empty()
        for i, file_path in enumerate(file_paths):
            if os.path.exists(file_path):
+                try:
+                    # Explicitly state format if known, otherwise pydub tries to guess
+                    # Assuming all inputs are WAV due to our conversion logic
+                    audio = AudioSegment.from_file(file_path, format="wav")
+                    combined += audio
+                    if i < len(file_paths) - 1:
+                        combined += AudioSegment.silent(duration=200) # Small silence
+                except Exception as e_load:
+                    print(f"⚠️ Error loading audio file {file_path} with pydub: {e_load}")
+                    return False, f"Error loading audio file {os.path.basename(file_path)}: {e_load}", None
            else:
+                print(f"⚠️ File not found for merging: {file_path}")
+                # Decide if this is critical; for now, we'll say it is.
+                return False, f"File not found for merging: {os.path.basename(file_path)}", None

        abs_output_path = os.path.abspath(output_path)
        combined.export(abs_output_path, format="wav")
        return True, f"Merged file saved: {os.path.basename(abs_output_path)}", abs_output_path
    except Exception as e:
+        print(f"❌ Error merging files: {e}")
        return False, f"Error merging files: {e}", None

def create_zip_file(file_paths, zip_name):
@@ -141,30 +177,37 @@ def create_zip_file(file_paths, zip_name):
        for file_path in file_paths:
            if os.path.exists(file_path):
                zipf.write(file_path, os.path.basename(file_path))
        return True, f"ZIP file created: {os.path.basename(abs_zip_name)}", abs_zip_name
    except Exception as e:
        return False, f"Error creating ZIP file: {e}", None

+# --- Main generation function (modified for Gradio & HF Secrets) ---
def generate_audio_for_gradio(
+    # api_key_input_field is removed, will use HF_GEMINI_API_KEY
+    use_file_input_checkbox, text_file_obj,
    speech_prompt_input, text_to_speak_input,
    max_chunk_slider, sleep_slider, temperature_slider,
    model_dropdown, speaker_dropdown, output_filename_base_input,
+    merge_checkbox, delete_partials_checkbox,
+    # Progress for Gradio (optional but good for long tasks)
+    progress=gr.Progress(track_tqdm=True)
):
    status_messages = []
    status_messages.append("🚀 Starting Text-to-Speech process...")
+    progress(0, desc="Initializing...")
+
+    # 1. API Key Validation (from HF Secrets)
+    api_key_to_use = HF_GEMINI_API_KEY
+    if not api_key_to_use:
+        # Fallback if user provides one in a field (though we removed the field)
+        # This part can be removed if you *only* want to use secrets
+        # For now, let's assume if HF_GEMINI_API_KEY is None, we raise an error.
+        status_messages.append("❌ Error: GEMINI_API_KEY not found in Hugging Face Secrets.")
+        status_messages.append("➡️ Please set it in your Space's Settings > Secrets.")
        return None, None, "\n".join(status_messages)
+
+    os.environ["GEMINI_API_KEY"] = api_key_to_use # Set for genai library
+    status_messages.append("🔑 API Key loaded from Secrets.")

    # 2. Determine Text Input
    actual_text_input = ""
@@ -187,6 +230,7 @@ def generate_audio_for_gradio(
    # 3. Initialize GenAI Client
    try:
        status_messages.append("🛠️ Initializing Gemini client...")
+        progress(0.1, desc="Initializing Gemini Client...")
        client = genai.Client(api_key=os.environ.get("GEMINI_API_KEY"))
        status_messages.append("✅ Gemini client initialized.")
    except Exception as e:
@@ -196,22 +240,25 @@
    # 4. Split text
    text_chunks = smart_text_split(actual_text_input, int(max_chunk_slider))
    status_messages.append(f"📝 Text split into {len(text_chunks)} chunk(s).")
+    for i, chunk_text in enumerate(text_chunks): # Renamed 'chunk' to 'chunk_text'
+        status_messages.append(f"   📄 Chunk {i+1}: {len(chunk_text)} chars")

    # 5. Generate audio for each chunk
    generated_audio_files = []
    run_id = base64.urlsafe_b64encode(os.urandom(6)).decode()
    temp_output_dir = f"temp_audio_{run_id}"
    os.makedirs(temp_output_dir, exist_ok=True)
+    output_base_name_safe = re.sub(r'[\s\\\/\:\*\?\"\<\>\|\%]+', '_', output_filename_base_input) # More robust sanitize

+    total_chunks = len(text_chunks)
+    for i, chunk_text_content in enumerate(text_chunks):
+        progress_val = 0.1 + (0.7 * (i / total_chunks)) # Progress from 10% to 80% during generation
+        progress(progress_val, desc=f"Generating chunk {i+1}/{total_chunks}...")
+
+        status_messages.append(f"\n🔊 Generating audio for chunk {i+1}/{total_chunks}...")
+        final_text_for_api = f'"{speech_prompt_input}"\n{chunk_text_content}' if speech_prompt_input.strip() else chunk_text_content
+
+        contents_for_api = [types.Content(role="user", parts=[types.Part.from_text(text=final_text_for_api)])]
        generate_content_config = types.GenerateContentConfig(
            temperature=float(temperature_slider),
            response_modalities=["audio"],
@@ -223,244 +270,281 @@
        )
        try:
            chunk_filename_base = f"{output_base_name_safe}_part_{i+1:03d}"
            chunk_filepath_prefix = os.path.join(temp_output_dir, chunk_filename_base)
+
+            audio_data_received = False
+            for stream_response_chunk in client.models.generate_content_stream(
+                model=model_dropdown, contents=contents_for_api, config=generate_content_config,
            ):
+                if (stream_response_chunk.candidates and stream_response_chunk.candidates[0].content and
+                    stream_response_chunk.candidates[0].content.parts and
+                    stream_response_chunk.candidates[0].content.parts[0].inline_data):

+                    inline_data = stream_response_chunk.candidates[0].content.parts[0].inline_data
                    data_buffer = inline_data.data
+                    api_mime_type = inline_data.mime_type
+                    audio_data_received = True
+
+                    status_messages.append(f"ℹ️ API returned MIME type: {api_mime_type}")
+
+                    # Determine file extension and convert if necessary
+                    file_extension = ".wav" # Default to .wav and convert
+                    if api_mime_type and ("mp3" in api_mime_type.lower() or "mpeg" in api_mime_type.lower()):
+                        file_extension = ".mp3"
+                        # For MP3, data_buffer is already MP3. No conversion needed for saving.
+                        # pydub will need ffmpeg to read MP3 for merging.
+                        status_messages.append(f"ℹ️ Saving as MP3 based on MIME: {api_mime_type}")
+                    elif api_mime_type and "wav" in api_mime_type.lower() and \
+                         not ("audio/l16" in api_mime_type.lower() or "audio/l24" in api_mime_type.lower()):
+                        file_extension = ".wav"
+                        # API says WAV and it's not raw L16/L24, trust it.
+                        status_messages.append(f"ℹ️ Saving as WAV based on MIME: {api_mime_type}")
+                    else: # Raw PCM (like audio/L16), unknown, or .bin -> convert to WAV
+                        file_extension = ".wav"
+                        status_messages.append(f"ℹ️ Converting to WAV for MIME: {api_mime_type or 'Unknown'}")
+                        data_buffer = convert_to_wav(data_buffer, api_mime_type)
+
+                    status_messages.append(f"ℹ️ Determined file extension: {file_extension}")
+
                    generated_file_path = save_binary_file(f"{chunk_filepath_prefix}{file_extension}", data_buffer)
+                    if generated_file_path:
+                        generated_audio_files.append(generated_file_path)
+                        status_messages.append(f"✅ Chunk {i+1} saved: {os.path.basename(generated_file_path)}")
+                    else:
+                        status_messages.append(f"❌ Failed to save chunk {i+1}.")
+                    break # Processed this audio data from stream
+
+                elif stream_response_chunk.text:
+                    status_messages.append(f"ℹ️ API Text Message (during stream): {stream_response_chunk.text}")
+
+            if not audio_data_received:
+                status_messages.append(f"❌ No audio data received in stream for chunk {i+1}.")
+                # Check for errors in the stream response if available
+                if stream_response_chunk and stream_response_chunk.prompt_feedback and stream_response_chunk.prompt_feedback.block_reason:
+                    status_messages.append(f"🛑 API Block Reason: {stream_response_chunk.prompt_feedback.block_reason_message or stream_response_chunk.prompt_feedback.block_reason}")
+
+
+        except types.BlockedPromptException as bpe:
+            status_messages.append(f"❌ Content blocked for chunk {i+1}: {bpe}")
+            status_messages.append(f"   Feedback: {bpe.response.prompt_feedback}")
+        except types.StopCandidateException as sce:
+            status_messages.append(f"❌ Generation stopped for chunk {i+1}: {sce}")
+            status_messages.append(f"   Feedback: {sce.response.prompt_feedback}")
        except Exception as e:
+            status_messages.append(f"❌ Error generating/processing chunk {i+1}: {e}")
+            import traceback
+            status_messages.append(traceback.format_exc()) # More detailed error
+            continue

+        if i < total_chunks - 1:
            status_messages.append(f"⏱️ Waiting {sleep_slider}s...")
            time.sleep(float(sleep_slider))

+    progress(0.85, desc="Processing generated files...")
    # 6. Handle output files
    if not generated_audio_files:
+        status_messages.append("❌ No audio files were successfully generated or saved!")
+        final_status = "\n".join(status_messages)
+        print(final_status)
+        progress(1, desc="Finished with errors.")
+        return None, None, final_status

    status_messages.append(f"\n🎉 {len(generated_audio_files)} audio file(s) generated!")

+    output_audio_path_for_player = None # For gr.Audio, ideally a single WAV
+    output_path_for_download = None # For gr.File, can be WAV or ZIP

    if merge_checkbox and len(generated_audio_files) > 1:
        if not PYDUB_AVAILABLE:
            status_messages.append("⚠️ pydub not available. Cannot merge. Returning ZIP of parts.")
+            success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
+            status_messages.append(msg_zip)
+            if success_zip: output_path_for_download = zip_p
        else:
+            status_messages.append(f"🔄 Merging {len(generated_audio_files)} files (all should be WAVs now)...")
+            # Ensure all files for merging are WAV, convert if any MP3s were saved and pydub is used
+            # For simplicity, our save logic now tries to make them WAV if not MP3 from API.
+            # If an MP3 was saved and PYDUB_AVAILABLE, it should handle it.
+
            merged_filename_path = os.path.join(temp_output_dir, f"{output_base_name_safe}_merged.wav")
+            success_merge, msg_merge, merged_p = merge_audio_files_func(generated_audio_files, merged_filename_path)
+            status_messages.append(msg_merge)
+            if success_merge:
+                output_audio_path_for_player = merged_p
+                output_path_for_download = merged_p
                if delete_partials_checkbox:
                    status_messages.append("🗑️ Deleting partial files...")
                    for file_p in generated_audio_files:
+                        try: os.remove(file_p); status_messages.append(f"   🗑️ Deleted: {os.path.basename(file_p)}")
+                        except Exception as e_del: status_messages.append(f"   ⚠️ Could not delete {os.path.basename(file_p)}: {e_del}")
+            else:
                status_messages.append("❌ Merge failed. Providing ZIP of parts.")
                success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
                status_messages.append(msg_zip)
+                if success_zip: output_path_for_download = zip_p
    elif len(generated_audio_files) == 1:
+        # Single file, should be WAV due to our conversion logic or MP3 if API sent that
+        single_file_path = generated_audio_files[0]
+        if single_file_path.lower().endswith(".mp3") and PYDUB_AVAILABLE:
+            # Convert MP3 to WAV for Gradio player if it prefers WAV
+            # Or, gr.Audio might handle MP3 directly. Let's test.
+            # For now, assume gr.Audio handles common types.
+            output_audio_path_for_player = single_file_path
+            status_messages.append(f"🎵 Single MP3 file: {os.path.basename(single_file_path)}")
+        else: # Assume WAV
+            output_audio_path_for_player = single_file_path
+            status_messages.append(f"🎵 Single WAV file: {os.path.basename(single_file_path)}")
+        output_path_for_download = single_file_path
    else: # Multiple files, no merge requested
        status_messages.append("📦 Multiple parts generated. Creating ZIP file.")
+        success_zip, msg_zip, zip_p = create_zip_file(generated_audio_files, os.path.join(temp_output_dir, f"{output_base_name_safe}_all_parts.zip"))
+        status_messages.append(msg_zip)
+        if success_zip: output_path_for_download = zip_p

    final_status = "\n".join(status_messages)
    print(final_status)
+    print(f"DEBUG: output_audio_path_for_player: {output_audio_path_for_player}")
+    print(f"DEBUG: output_path_for_download: {output_path_for_download}")
+    progress(1, desc="Finished!")
+    return output_audio_path_for_player, output_path_for_download, final_status
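Because all of the handler's inputs are plain Python values, it can also be exercised without the UI. A rough smoke test (an assumption, not part of the commit; it makes a real API call and substitutes a no-op for the Gradio progress callback) could be:

# Hypothetical headless call; requires GEMINI_API_KEY to be set in the environment.
audio_path, download_path, log = generate_audio_for_gradio(
    False, None,                      # use_file_input_checkbox, text_file_obj
    "A calm narrator.", "Hello from a quick smoke test.",
    3800, 2, 0.7,                     # max_chunk_slider, sleep_slider, temperature_slider
    MODELS[0], "Charon", "smoke_test",
    True, True,                       # merge_checkbox, delete_partials_checkbox
    progress=lambda *args, **kwargs: None,  # skip Gradio progress tracking
)
print(log)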

# --- Gradio Interface Definition ---
+with gr.Blocks(theme=gr.themes.Soft(primary_hue=gr.themes.colors.blue, secondary_hue=gr.themes.colors.sky)) as demo:
    gr.Markdown("# 🎵 Gemini Text-to-Speech UI 🗣️")
+    if not HF_GEMINI_API_KEY:
+        gr.Warning(
+            "GEMINI_API_KEY not found in Hugging Face Secrets. "
+            "Please add it in your Space's 'Settings' > 'Secrets' tab for the app to work. "
+            "Name the secret `GEMINI_API_KEY`."
+        )
+    else:
+        gr.Info("Gemini API Key loaded successfully from Space Secrets. Ready to generate!")
+
    gr.Markdown(
        "Convert text to speech using Google's Gemini API. "
+        "Your Gemini API Key must be set as a Secret named `GEMINI_API_KEY` in this Space's settings."
+        "\n\nGet your API Key from [Google AI Studio](https://aistudio.google.com/app/apikey)."
    )

    with gr.Row():
+        with gr.Column(scale=2): # Wider column for text inputs
+            use_file = gr.Checkbox(label="📁 Use Text File Input (.txt)", value=False)
            text_file = gr.File(
+                label="Upload Text File", # Simpler label
                file_types=['.txt'],
                visible=False # Initially hidden
            )
            text_to_speak = gr.Textbox(
+                label="📝 Text to Speak (or use file above)",
+                lines=10,
+                placeholder="Enter text here...",
                visible=True # Initially visible
            )
            use_file.change(
                lambda x: (gr.update(visible=x), gr.update(visible=not x)),
                [use_file],
                [text_file, text_to_speak]
            )
            speech_prompt = gr.Textbox(
                label="🗣️ Speech Prompt (Optional)",
                placeholder="e.g., 'As an energetic YouTuber speaking to an audience'",
                info="Influences style, emotion, and voice characteristics."
            )

        with gr.Column(scale=1):
            model_name = gr.Dropdown(
+                MODELS, label="🤖 Model", value=MODELS[0]
            )
            speaker_voice = gr.Dropdown(
+                SPEAKER_VOICES, label="🎤 Speaker Voice", value="Charon"
            )
            temperature = gr.Slider(
+                minimum=0.0, maximum=1.0, step=0.05, value=0.7, # Gemini TTS often uses temp <= 1
                label="🌡️ Temperature",
+                info="Controls randomness (0.0-1.0). Higher for more variation."
            )
            max_chunk_size = gr.Slider(
+                minimum=1000, maximum=4000, step=100, value=3800,
                label="🧩 Max Characters per Chunk",
+                info="Text is split for API. Max 4096 per request for some models."
            )
            sleep_between_requests = gr.Slider(
+                minimum=1, maximum=15, step=0.5, value=2, # Reduced default sleep
+                label="⏱️ Sleep Between Chunks (sec)",
+                info="Helps manage API rate limits (e.g. Gemini Flash has 60 RPM limit)."
            )
+            output_filename_base = gr.Textbox(
+                label="💾 Output Filename Base", value="gemini_tts_audio"
+            )
+
+            with gr.Group(visible=PYDUB_AVAILABLE):
+                merge_audio = gr.Checkbox(label="🔗 Merge Audio Chunks (if >1)", value=True)
+                delete_partials = gr.Checkbox(label="🗑️ Delete Chunks After Merge", value=True, visible=True) # Default visible
                merge_audio.change(lambda x: gr.update(visible=x), [merge_audio], [delete_partials])
+
+            if not PYDUB_AVAILABLE:
+                gr.Markdown("<small>⚠️ Merging disabled: `pydub` library not found. Install if needed.</small>")


+    submit_button = gr.Button("✨ Generate Audio ✨", variant="primary", scale=2) # Centered button

    with gr.Row():
        with gr.Column(scale=1):
+            output_audio_player = gr.Audio(label="🎧 Generated Audio Output", type="filepath", format="wav") # Specify format if known
        with gr.Column(scale=1):
            output_file_download = gr.File(label="📥 Download Output File", type="filepath")

+    status_textbox = gr.Textbox(label="📋 Status Log", lines=10, interactive=False, max_lines=20)

    submit_button.click(
        fn=generate_audio_for_gradio,
        inputs=[
+            use_file, text_file, speech_prompt, text_to_speak,
            max_chunk_size, sleep_between_requests, temperature,
            model_name, speaker_voice, output_filename_base,
+            merge_audio, delete_partials # Even if not visible, pass them
        ],
        outputs=[output_audio_player, output_file_download, status_textbox]
    )

    gr.Markdown("---")
    # The encoded text part:
+    encoded_text = "Q3JlYXRlIGJ5IDogYWlnb2xkZW4=" # "Created by : aigolden"
    try:
+        decoded_text = base64.b64decode(encoded_text.encode('utf-8')).decode('utf-8')
+        gr.Markdown(f"<p style='text-align:center; font-size:small;'><em>{decoded_text}</em></p>")
+    except Exception as e_decode:
+        print(f"Error decoding/displaying credit: {e_decode}")
        pass
+
    gr.Examples(
        examples=[
+            [False, None, "A friendly and informative narrator.", "Hello world, this is a test of the Gemini text to speech API using Gradio. I hope this works well!", 3800, 2, 0.7, MODELS[0], "Charon", "example_hello", True, True],
+            [False, None, "An excited news reporter.", "Breaking news! Artificial intelligence can now generate human-like speech. This technology is rapidly evolving!", 3000, 2, 0.8, MODELS[1], "Achernar", "example_news", True, True],
+            [True, "sample_text.txt", "A calm storyteller.", "", 3500, 3, 0.6, MODELS[0], "Vindemiatrix", "example_from_file", True, False]
        ],
+        fn=generate_audio_for_gradio, # Ensure example fn is the same as main
+        inputs=[ # Ensure these match the function's inputs exactly (order and number)
+            use_file, text_file, speech_prompt, text_to_speak,
            max_chunk_size, sleep_between_requests, temperature,
            model_name, speaker_voice, output_filename_base,
            merge_audio, delete_partials
        ],
        outputs=[output_audio_player, output_file_download, status_textbox],
+        cache_examples=False # API calls, so don't cache results based on static inputs
    )
+    gr.Markdown("<small>To use the 'example_from_file', please create a `sample_text.txt` file in the root of this Space with some text content, or upload your own text file.</small>")


if __name__ == "__main__":
    if not PYDUB_AVAILABLE:
        print("WARNING: pydub library is not installed or working. Audio file merging will be disabled.")
+    if not HF_GEMINI_API_KEY:
+        print("WARNING: GEMINI_API_KEY environment variable not set. The app might not work in local if it relies on this for API key.")
+
+    # For local testing, you might want to provide a way to input the API key
+    # or set the GEMINI_API_KEY environment variable before running.
+    # e.g., export GEMINI_API_KEY="your_key_here"
+    # then run python app.py
+
+    demo.launch(debug=True, share=False) # share=False for local, HF Spaces handles public link
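The third example row and the closing note refer to a sample_text.txt that is not included in this commit. If you want that example to work, one way to provide it (the contents here are an arbitrary assumption) is:

# Create a small sample file for the file-based example.
with open("sample_text.txt", "w", encoding="utf-8") as f:
    f.write("This is sample text for the file-based example. It has a few short sentences. Enjoy!")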