"""Text-to-speech helpers built on the Kokoro ``KPipeline``."""

import logging
import os
import time

import numpy as np
import soundfile as sf
from kokoro import KPipeline

logger = logging.getLogger(__name__)

# Sample rate (Hz) used for every file written and every streamed segment.
SAMPLE_RATE = 24000

# Directory that receives generated WAV files; created on demand.
OUTPUT_DIR = "temp/outputs"


class TTSEngine:
    """Thin wrapper around a Kokoro ``KPipeline`` bound to one language."""

    def __init__(self, lang_code='z'):
        """Initialize TTS Engine with Kokoro.

        Args:
            lang_code (str): Language code ('a' for US English, 'b' for
                British English, 'j' for Japanese, 'z' for Mandarin Chinese).

        NOTE(review): the default here is 'z' while ``get_tts_engine``
        defaults to 'a' -- confirm which default is intended.
        """
        logger.info("Initializing TTS Engine")
        self.pipeline = KPipeline(lang_code=lang_code)
        logger.info("TTS engine initialized")

    def generate_speech(self, text: str, voice: str = 'af_heart',
                        speed: float = 1.0) -> str:
        """Synthesize ``text`` and write the result to a single WAV file.

        Every segment produced by the pipeline is concatenated, so the file
        contains the whole input rather than only its first segment.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.).
            speed (float): Speech speed multiplier (documented range
                0.5 to 2.0; not validated here).

        Returns:
            str: Path to the generated audio file.

        Raises:
            ValueError: If the pipeline produced no audio for ``text``.
            Exception: Any error raised by the pipeline or while writing the
                file is logged and re-raised unchanged.
        """
        logger.info("Generating speech for text length: %d", len(text))
        try:
            os.makedirs(OUTPUT_DIR, exist_ok=True)

            # Nanosecond resolution keeps calls made within the same second
            # from overwriting each other's output.
            output_path = f"{OUTPUT_DIR}/output_{time.time_ns()}.wav"

            # The pipeline yields one (_, _, audio) triple per segment;
            # collect all of them instead of stopping after the first.
            segments = [
                np.asarray(audio)
                for _, _, audio in self.pipeline(text, voice=voice, speed=speed)
            ]
            if not segments:
                # Without this guard the caller would receive a path to a
                # file that was never written.
                raise ValueError("No audio was generated for the given text")

            logger.info("Saving audio to %s", output_path)
            sf.write(output_path, np.concatenate(segments), SAMPLE_RATE)

            logger.info("Audio generation complete: %s", output_path)
            return output_path
        except Exception as e:
            logger.error("TTS generation failed: %s", e, exc_info=True)
            raise

    def generate_speech_stream(self, text: str, voice: str = 'af_heart',
                               speed: float = 1.0):
        """Synthesize ``text`` and yield each segment as it is produced.

        Args:
            text (str): Input text to synthesize.
            voice (str): Voice ID to use (e.g., 'af_heart', 'af_bella', etc.).
            speed (float): Speech speed multiplier (documented range
                0.5 to 2.0; not validated here).

        Yields:
            tuple: (sample_rate, audio_data) pairs for each segment.

        Raises:
            Exception: Any error raised while iterating the pipeline is
                logged and re-raised unchanged.
        """
        try:
            for _, _, audio in self.pipeline(text, voice=voice, speed=speed):
                yield SAMPLE_RATE, audio
        except Exception as e:
            logger.error("TTS streaming failed: %s", e, exc_info=True)
            raise


# Engines created when Streamlit is unavailable, keyed by language code, so
# repeated calls reuse the already-loaded pipeline.
_ENGINE_CACHE = {}


def get_tts_engine(lang_code='a'):
    """Get or create the TTS engine instance for ``lang_code``.

    When Streamlit is importable the engine is cached with
    ``st.cache_resource``; otherwise a module-level dictionary is used.
    In both cases the cache is keyed by ``lang_code``, so asking for a
    different language returns a different engine.

    Args:
        lang_code (str): Language code for the pipeline.

    Returns:
        TTSEngine: Initialized TTS engine instance.
    """
    # Only the import is guarded: an ImportError raised while constructing
    # the engine must propagate instead of being mistaken for "no Streamlit".
    try:
        import streamlit as st
    except ImportError:
        st = None

    if st is not None:
        # The language code is passed as an argument (not captured by the
        # closure) so that it takes part in the cache key.
        @st.cache_resource
        def _get_engine(lang_code):
            return TTSEngine(lang_code)

        return _get_engine(lang_code)

    engine = _ENGINE_CACHE.get(lang_code)
    if engine is None:
        engine = TTSEngine(lang_code)
        _ENGINE_CACHE[lang_code] = engine
    return engine


def generate_speech(text: str, voice: str = 'af_heart', speed: float = 1.0) -> str:
    """Public interface for TTS generation.

    Uses the engine returned by ``get_tts_engine()`` with its default
    language code.

    Args:
        text (str): Input text to synthesize.
        voice (str): Voice ID to use.
        speed (float): Speech speed multiplier.

    Returns:
        str: Path to generated audio file.
    """
    engine = get_tts_engine()
    return engine.generate_speech(text, voice=voice, speed=speed)