"""Main entry point for the Audio Translation Web Application using Gradio.

Handles file upload, processing pipeline, and UI rendering.
"""

import logging
import os
import time
import uuid

# Logging is configured before the third-party and project imports below
# (the original ordering), so the handlers exist while those modules load.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("app.log"),
        logging.StreamHandler()
    ]
)
logger = logging.getLogger(__name__)

import gradio as gr  # noqa: E402
import numpy as np  # noqa: E402,F401
import soundfile as sf  # noqa: E402

from utils.stt import transcribe_audio  # noqa: E402
from utils.translation import translate_text  # noqa: E402
from utils.tts import get_tts_engine, generate_speech  # noqa: E402,F401

# Working directories for uploaded audio and generated chunks.
UPLOAD_DIR = "temp/uploads"
OUTPUT_DIR = "temp/outputs"

# Initialize environment configurations
os.makedirs(UPLOAD_DIR, exist_ok=True)
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Voice used when the UI selection is unknown and by the one-shot pipeline.
DEFAULT_VOICE_ID = "zf_xiaobei"

# Maps the labels shown in the voice dropdown to the TTS voice IDs.
_VOICE_IDS = {
    "Xiaobei (Female)": "zf_xiaobei",
    "Yunjian (Male)": "zm_yunjian",
}

# CSS for styling the Gradio interface
css = """
.gradio-container {
    max-width: 1200px;
    margin: 0 auto;
}
.output-text {
    font-family: monospace;
    padding: 10px;
    background-color: #f5f5f5;
    border-radius: 4px;
}
"""


def _unique_wav_path(directory, prefix):
    """Return a collision-free ``.wav`` path inside *directory*.

    The timestamp keeps files roughly sortable; the random suffix prevents two
    requests that arrive within the same clock tick from sharing a file name.
    """
    filename = f"{prefix}_{time.time()}_{uuid.uuid4().hex}.wav"
    return os.path.join(directory, filename)


def _resolve_voice_id(voice_name):
    """Translate a dropdown label into a TTS voice ID, defaulting to Xiaobei."""
    return _VOICE_IDS.get(voice_name, DEFAULT_VOICE_ID)


def handle_file_processing(audio_file):
    """
    Execute the complete processing pipeline:
    1. Speech-to-Text (STT)
    2. Machine Translation
    3. Text-to-Speech (TTS)

    Args:
        audio_file: Tuple containing (sample_rate, audio_data), or None when
            the user has not provided any audio.

    Returns:
        Tuple containing (english_text, chinese_text, output_audio), where
        output_audio is itself a (sample_rate, audio_data) tuple.

    Raises:
        gr.Error: If no audio was provided or any pipeline stage fails.
    """
    logger.info("Starting processing for uploaded audio")

    # Without this guard an empty input fails on tuple unpacking and the user
    # sees "cannot unpack non-iterable NoneType object".
    if audio_file is None:
        logger.error("Processing failed: no audio was provided")
        raise gr.Error("Processing Failed: no audio was provided")

    try:
        # Save the uploaded audio to a temporary file
        sr, audio_data = audio_file
        temp_path = _unique_wav_path(UPLOAD_DIR, "upload")
        sf.write(temp_path, audio_data, sr)
        logger.info("Saved uploaded audio to %s", temp_path)

        # STT Phase
        logger.info("Beginning STT processing")
        english_text = transcribe_audio(temp_path)
        logger.info(
            "STT completed. Text length: %s characters", len(english_text)
        )

        # Translation Phase
        logger.info("Beginning translation")
        chinese_text = translate_text(english_text)
        logger.info(
            "Translation completed. Translated length: %s characters",
            len(chinese_text),
        )

        # TTS Phase
        logger.info("Beginning TTS generation")
        # Initialize TTS engine with appropriate language code for Chinese
        engine = get_tts_engine(lang_code='z')  # 'z' for Mandarin Chinese

        # Generate speech and get the file path
        output_path = engine.generate_speech(
            chinese_text, voice=DEFAULT_VOICE_ID
        )
        logger.info("TTS completed. Output file: %s", output_path)

        # Load the generated audio for Gradio output
        audio_data, sr = sf.read(output_path)
        return english_text, chinese_text, (sr, audio_data)
    except Exception as e:
        # Top-level UI boundary: log the traceback and surface a readable
        # message, keeping the original exception as the cause.
        logger.error("Processing failed: %s", e, exc_info=True)
        raise gr.Error(f"Processing Failed: {str(e)}") from e


def stream_audio(chinese_text, voice, speed):
    """
    Stream audio in chunks for the Gradio interface

    Args:
        chinese_text: The Chinese text to convert to speech
        voice: The voice to use
        speed: The speech speed factor

    Returns:
        Generator yielding (sample_rate, audio_chunk) tuples. Nothing is
        yielded when the text is empty or whitespace-only.
    """
    if not chinese_text or not chinese_text.strip():
        logger.warning("Stream requested with no text; nothing to synthesize")
        return

    engine = get_tts_engine(lang_code='z')

    # Stream the audio in chunks
    for sample_rate, audio_chunk in engine.generate_speech_stream(
        chinese_text,
        voice=voice,
        speed=speed
    ):
        # Each chunk is written to a WAV file and read back, so the yielded
        # array is what soundfile decodes from that file.
        temp_chunk_path = _unique_wav_path(OUTPUT_DIR, "chunk")
        try:
            sf.write(temp_chunk_path, audio_chunk, sample_rate)
            chunk_data, sr = sf.read(temp_chunk_path)
        finally:
            # Remove the chunk file even if the write or read failed.
            if os.path.exists(temp_chunk_path):
                os.remove(temp_chunk_path)

        yield (sr, chunk_data)


def _stream_selected_voice(text, voice_name, speed):
    """Stream TTS audio for the voice label selected in the UI.

    This is a generator function on purpose: a lambda that merely returns
    ``stream_audio(...)`` hands back a generator object but is not itself a
    generator function, so it would not be handled as a streaming event.
    """
    yield from stream_audio(text, _resolve_voice_id(voice_name), speed)


def create_interface():
    """
    Create and configure the Gradio interface

    Returns:
        Gradio Blocks interface
    """
    with gr.Blocks(css=css) as interface:
        gr.Markdown("# 🎧 High-Quality Audio Translation System")
        gr.Markdown("Upload English Audio → Get Chinese Speech Output")

        with gr.Row():
            with gr.Column(scale=2):
                # File upload component
                audio_input = gr.Audio(
                    label="Upload English Audio",
                    type="numpy",
                    sources=["upload", "microphone"]
                )

                # Process button
                process_btn = gr.Button("Process Audio", variant="primary")

            with gr.Column(scale=1):
                # TTS Settings
                # gr.Group replaces gr.Box, which is not available in the
                # Gradio releases that accept `sources=` on gr.Audio above.
                with gr.Group():
                    gr.Markdown("### TTS Settings")
                    voice_dropdown = gr.Dropdown(
                        choices=list(_VOICE_IDS),
                        value="Xiaobei (Female)",
                        label="Select Voice"
                    )
                    speed_slider = gr.Slider(
                        minimum=0.5,
                        maximum=2.0,
                        value=1.0,
                        step=0.1,
                        label="Speech Speed"
                    )

        # Output section
        with gr.Row():
            with gr.Column(scale=2):
                # Text outputs
                english_output = gr.Textbox(
                    label="Recognition Results",
                    lines=5,
                    elem_classes=["output-text"]
                )
                chinese_output = gr.Textbox(
                    label="Translation Results",
                    lines=5,
                    elem_classes=["output-text"]
                )

            with gr.Column(scale=1):
                # Audio output
                audio_output = gr.Audio(
                    label="Audio Output",
                    type="numpy"
                )

                # Stream button
                stream_btn = gr.Button("Stream Audio")

                # Download button is automatically provided by gr.Audio

        # Set up event handlers
        process_btn.click(
            fn=handle_file_processing,
            inputs=[audio_input],
            outputs=[english_output, chinese_output, audio_output]
        )

        # Stream button handler
        stream_btn.click(
            fn=_stream_selected_voice,
            inputs=[chinese_output, voice_dropdown, speed_slider],
            outputs=audio_output
        )

        # Examples
        gr.Examples(
            examples=[
                ["examples/sample1.mp3"],
                ["examples/sample2.wav"]
            ],
            inputs=audio_input
        )

    return interface


def main():
    """
    Main application entry point
    """
    logger.info("Starting Gradio application")
    interface = create_interface()
    interface.launch()


if __name__ == "__main__":
    main()