Spaces:

BryanBradfo
/

KokAudio

Running

App Files Files Community

BryanBradfo committed on Apr 2

Commit

c5dd1ce

1 Parent(s): 2f67766

voicebloom

Browse files

Files changed (1) hide show

app.py +150 -103

app.py CHANGED Viewed

@@ -1,57 +1,94 @@
-import spaces
-from kokoro import KModel, KPipeline
-import gradio as gr
 import os
 import random
 import torch
-CUDA_AVAILABLE = False  # Set to False to disable GPU usage
-models = {gpu: KModel().to('cuda' if gpu else 'cpu').eval() for gpu in [False]}
 pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
 pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
 pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
-def generate_first(text, voice='af_heart', speed=1):
-    pipeline = pipelines[voice[0]]
-    pack = pipeline.load_voice(voice)
-    for _, ps, _ in pipeline(text, voice, speed):
-        ref_s = pack[len(ps)-1]
-        audio = models[False](ps, ref_s, speed)
-        return (24000, audio.numpy()), ps
-    return None, ''
-def tokenize_first(text, voice='af_heart'):
-    pipeline = pipelines[voice[0]]
-    for _, ps, _ in pipeline(text, voice):
-        return ps
-    return ''
-def generate_all(text, voice='af_heart', speed=1):
-    pipeline = pipelines[voice[0]]
-    pack = pipeline.load_voice(voice)
-    first = True
-    for _, ps, _ in pipeline(text, voice, speed):
-        ref_s = pack[len(ps)-1]
-        audio = models[False](ps, ref_s, speed)
-        yield 24000, audio.numpy()
-        if first:
-            first = False
-            yield 24000, torch.zeros(1).numpy()
-# Wisdom quotes
 QUOTES = [
     "When it is obvious that the goals cannot be reached, don't adjust the goals, adjust the action steps. - Confucius",
     "The man who moves a mountain begins by carrying away small stones. - Confucius",
     "Life is really simple, but we insist on making it complicated. - Confucius",
     "It does not matter how slowly you go as long as you do not stop. - Confucius",
     "Eating fruit daily provides essential vitamins, minerals and fiber that help maintain good health.",
     "An apple a day keeps the doctor away - a simple habit with profound health benefits.",
     "Fruits are nature's candy - sweet, nutritious, and vital for our wellbeing.",
     "Regular consumption of fruits boosts your immune system and reduces risk of chronic diseases.",
     "Colorful fruits on your plate mean a rainbow of nutrients for your body.",
-    "The wisdom of health lies in eating seasonal fruits that nature provides us."
 ]
 def get_random_quote():
     return random.choice(QUOTES)
@@ -77,79 +114,89 @@ Eating seasonal fruits connects us with nature's rhythm and ensures we get the f
 Remember: 'Nature's first green is gold' - and nowhere is this more true than in the vibrant colors of fruit that nourish our bodies daily."""
-CHOICES = {
-'🇺🇸 🚺 Heart ❤️': 'af_heart',
-'🇺🇸 🚺 Bella 🔥': 'af_bella',
-'🇺🇸 🚺 Nicole 🎧': 'af_nicole',
-'🇺🇸 🚺 Aoede': 'af_aoede',
-'🇺🇸 🚺 Kore': 'af_kore',
-'🇺🇸 🚺 Sarah': 'af_sarah',
-'🇺🇸 🚹 Michael': 'am_michael',
-'🇺🇸 🚹 Fenrir': 'am_fenrir',
-'🇺🇸 🚹 Puck': 'am_puck',
-'🇺🇸 🚹 Echo': 'am_echo',
-'🇬🇧 🚺 Emma': 'bf_emma',
-'🇬🇧 🚺 Isabella': 'bf_isabella',
-'🇬🇧 🚹 George': 'bm_george',
-'🇬🇧 🚹 Fable': 'bm_fable',
-}
-for v in CHOICES.values():
-    pipelines[v[0]].load_voice(v)
-TOKEN_NOTE = '''
-💡 Customize pronunciation with Markdown link syntax and /slashes/ like `[Kokoro](/kˈOkəɹO/)`
-💬 To adjust intonation, try punctuation `;:,.!?—…"()""` or stress `ˈ` and `ˌ`
-⬇️ Lower stress `[1 level](-1)` or `[2 levels](-2)`
-⬆️ Raise stress 1 level `[or](+2)` 2 levels (only works on less stressed, usually short words)
-'''
-with gr.Blocks() as generate_tab:
-    out_audio = gr.Audio(label='Output Audio', interactive=False, streaming=False, autoplay=True)
-    generate_btn = gr.Button('Generate', variant='primary')
-    with gr.Accordion('Output Tokens', open=True):
-        out_ps = gr.Textbox(interactive=False, show_label=False, info='Tokens used to generate the audio, up to 510 context length.')
-        tokenize_btn = gr.Button('Tokenize', variant='secondary')
-        gr.Markdown(TOKEN_NOTE)
-STREAM_NOTE = ['⚠️ There is an unknown Gradio bug that might yield no audio the first time you click `Stream`.']
-STREAM_NOTE = '\n\n'.join(STREAM_NOTE)
-with gr.Blocks() as stream_tab:
-    out_stream = gr.Audio(label='Output Audio Stream', interactive=False, streaming=True, autoplay=True)
-    with gr.Row():
-        stream_btn = gr.Button('Stream', variant='primary')
-        stop_btn = gr.Button('Stop', variant='stop')
-    with gr.Accordion('Note', open=True):
-        gr.Markdown(STREAM_NOTE)
-BANNER_TEXT = '''
-# Kokoro TTS - Wisdom Speaker
-This demo uses the open-weight Kokoro TTS model to convert wisdom quotes into speech.
-'''
-with gr.Blocks() as app:
-    with gr.Row():
-        gr.Markdown(BANNER_TEXT, container=True)
     with gr.Row():
         with gr.Column():
-            text = gr.Textbox(label='Input Text', info="Up to ~500 characters per Generate")
-            with gr.Row():
-                voice = gr.Dropdown(list(CHOICES.items()), value='af_heart', label='Voice', info='Select from various voices')
-            speed = gr.Slider(minimum=0.5, maximum=2, value=1, step=0.1, label='Speed')
-            random_btn = gr.Button('🎲 Random Wisdom Quote 💬', variant='secondary')
-            with gr.Row():
-                confucius_btn = gr.Button('🧙 Confucius Wisdom 📜', variant='secondary')
-                fruit_btn = gr.Button('🍎 Fruit Wisdom 🍊', variant='secondary')
         with gr.Column():
-            gr.TabbedInterface([generate_tab, stream_tab], ['Generate', 'Stream'])
-    random_btn.click(fn=get_random_quote, inputs=[], outputs=[text])
-    confucius_btn.click(fn=get_confucius, inputs=[], outputs=[text])
-    fruit_btn.click(fn=get_fruit_wisdom, inputs=[], outputs=[text])
-    generate_btn.click(fn=generate_first, inputs=[text, voice, speed], outputs=[out_audio, out_ps])
-    tokenize_btn.click(fn=tokenize_first, inputs=[text, voice], outputs=[out_ps])
-    stream_event = stream_btn.click(fn=generate_all, inputs=[text, voice, speed], outputs=[out_stream])
     stop_btn.click(fn=None, cancels=stream_event)
-if __name__ == '__main__':
-    app.queue().launch()

 import os
 import random
 import torch
+from kokoro import KModel, KPipeline
+import gradio as gr
+# App configuration
+APP_TITLE = "✨ VoiceBloom ✨"
+APP_SUBTITLE = "Transform wisdom into delightful speech!"
+THEME = gr.themes.Soft(
+    primary_hue="indigo",
+    secondary_hue="purple",
+).set(
+    body_background_fill="linear-gradient(to right top, #d16ba5, #c777b9, #ba83ca, #aa8fd8, #9a9ae1, #8aa7ec, #79b3f4, #69bff8, #52cffe, #41dfff, #46eefa, #5ffbf1)",
+    button_primary_background_fill="linear-gradient(90deg, rgba(255,124,0,1) 0%, rgba(255,194,23,1) 100%)",
+    button_primary_background_fill_hover="linear-gradient(90deg, rgba(255,194,23,1) 0%, rgba(255,124,0,1) 100%)",
+    button_secondary_background_fill="linear-gradient(90deg, rgba(144,95,255,1) 0%, rgba(110,72,220,1) 100%)",
+    button_secondary_background_fill_hover="linear-gradient(90deg, rgba(110,72,220,1) 0%, rgba(144,95,255,1) 100%)",
+    block_background_fill="rgba(255, 255, 255, 0.8)",
+    block_shadow="0px 4px 12px rgba(0, 0, 0, 0.1)",
+    block_radius="12px",
+)
+# Initialize TTS models without GPU
+models = {False: KModel().to('cpu').eval()}
 pipelines = {lang_code: KPipeline(lang_code=lang_code, model=False) for lang_code in 'ab'}
 pipelines['a'].g2p.lexicon.golds['kokoro'] = 'kˈOkəɹO'
 pipelines['b'].g2p.lexicon.golds['kokoro'] = 'kˈQkəɹQ'
+# Define voice options with fun emoji indicators
+VOICES = {
+    '✨ Sarah (US) ✨': 'af_sarah',
+    '🌟 Nicole (US) 🌟': 'af_nicole',
+    '💖 Heart (US) 💖': 'af_heart',
+    '🔮 Bella (US) 🔮': 'af_bella',
+    '🌈 Aoede (US) 🌈': 'af_aoede',
+    '🎵 Michael (US) 🎵': 'am_michael',
+    '🌠 Echo (US) 🌠': 'am_echo',
+    '🧙 Fenrir (US) 🧙': 'am_fenrir',
+    '🎭 Puck (US) 🎭': 'am_puck',
+    '👑 Emma (UK) 👑': 'bf_emma',
+    '🌹 Isabella (UK) 🌹': 'bf_isabella',
+    '🎩 George (UK) 🎩': 'bm_george',
+    '✨ Fable (UK) ✨': 'bm_fable',
+}
+# Preload voices
+for v in VOICES.values():
+    pipelines[v[0]].load_voice(v)
+# Inspirational quotes
 QUOTES = [
     "When it is obvious that the goals cannot be reached, don't adjust the goals, adjust the action steps. - Confucius",
     "The man who moves a mountain begins by carrying away small stones. - Confucius",
     "Life is really simple, but we insist on making it complicated. - Confucius",
     "It does not matter how slowly you go as long as you do not stop. - Confucius",
+    "Our greatest glory is not in never falling, but in rising every time we fall. - Confucius",
+    "Silence is a true friend who never betrays. - Confucius",
     "Eating fruit daily provides essential vitamins, minerals and fiber that help maintain good health.",
     "An apple a day keeps the doctor away - a simple habit with profound health benefits.",
     "Fruits are nature's candy - sweet, nutritious, and vital for our wellbeing.",
     "Regular consumption of fruits boosts your immune system and reduces risk of chronic diseases.",
     "Colorful fruits on your plate mean a rainbow of nutrients for your body.",
+    "The wisdom of health lies in eating seasonal fruits that nature provides us.",
+    "A journey of a thousand miles begins with a single step.",
+    "Happiness is not something ready-made. It comes from your own actions.",
+    "The best time to plant a tree was 20 years ago. The second best time is now.",
 ]
+def generate_audio(text, voice, speed):
+    """Generate audio from text using the selected voice and speed"""
+    # The first character of the voice id selects the pipeline; `pipelines`
+    # is keyed by the language codes 'a' and 'b'.
+    pipeline = pipelines[voice[0]]
+    pack = pipeline.load_voice(voice)
+    for _, ps, _ in pipeline(text, voice, speed):
+        # The voice pack is indexed by the length of this segment's `ps`.
+        # NOTE(review): presumably `ps` is the phoneme string and the entry
+        # is a per-length reference style vector - confirm against kokoro.
+        ref_s = pack[len(ps)-1]
+        # `models` holds a single CPU model stored under the key False.
+        audio = models[False](ps, ref_s, speed)
+        # Returning inside the loop means only the first segment yielded by
+        # the pipeline is synthesized; later segments are never reached.
+        # NOTE(review): 24000 is presumably the sample rate in Hz - confirm.
+        return (24000, audio.numpy())
+    # Reached only when the pipeline yields no segments at all.
+    return None
+def stream_audio(text, voice, speed):
+    """Stream audio for longer texts"""
+    # The first character of the voice id selects the pipeline; `pipelines`
+    # is keyed by the language codes 'a' and 'b'.
+    pipeline = pipelines[voice[0]]
+    pack = pipeline.load_voice(voice)
+    # This is a generator: one (24000, array) pair is yielded per segment the
+    # pipeline produces, so the whole text is covered chunk by chunk.
+    for _, ps, _ in pipeline(text, voice, speed):
+        # The voice pack is indexed by the length of this segment's `ps`.
+        # NOTE(review): presumably `ps` is the phoneme string and the entry
+        # is a per-length reference style vector - confirm against kokoro.
+        ref_s = pack[len(ps)-1]
+        # `models` holds a single CPU model stored under the key False.
+        audio = models[False](ps, ref_s, speed)
+        # NOTE(review): 24000 is presumably the sample rate in Hz - confirm.
+        yield 24000, audio.numpy()
 def get_random_quote():
     # Return one entry of the module-level QUOTES list, chosen at random.
     return random.choice(QUOTES)
 Remember: 'Nature's first green is gold' - and nowhere is this more true than in the vibrant colors of fruit that nourish our bodies daily."""
+# Build the UI
+with gr.Blocks(theme=THEME) as app:
+    gr.Markdown(f"# {APP_TITLE}")
+    gr.Markdown(f"### {APP_SUBTITLE}")
     with gr.Row():
         with gr.Column():
+            # Input section with colorful styling
+            with gr.Box():
+                gr.Markdown("### 📝 What would you like me to say?")
+                text_input = gr.Textbox(
+                    label="",
+                    placeholder="Type your text here or choose a wisdom quote below...",
+                    lines=5
+                )
+                with gr.Row():
+                    random_btn = gr.Button("🎲 Random Quote", variant="secondary")
+                    confucius_btn = gr.Button("🧙 Confucius Wisdom", variant="secondary")
+                    fruit_btn = gr.Button("🍎 Fruit Wisdom", variant="secondary")
+            # Voice & Speed controls
+            with gr.Box():
+                gr.Markdown("### 🎤 Choose Your Voice")
+                voice_dropdown = gr.Dropdown(
+                    list(VOICES.items()),
+                    value=list(VOICES.items())[0][1],
+                    label="Voice Style"
+                )
+                gr.Markdown("### ⏱️ Adjust Speaking Speed")
+                speed_slider = gr.Slider(
+                    minimum=0.5,
+                    maximum=2.0,
+                    value=1.0,
+                    step=0.1,
+                    label="Speed"
+                )
+        # Output section
         with gr.Column():
+            with gr.Tabs():
+                with gr.TabItem("🔊 Listen"):
+                    audio_output = gr.Audio(
+                        label="Your VoiceBloom Creation",
+                        interactive=False,
+                        autoplay=True
+                    )
+                    generate_btn = gr.Button("🎵 Generate Audio", variant="primary", size="lg")
+                with gr.TabItem("📻 Stream Longer Text"):
+                    stream_output = gr.Audio(
+                        label="Streaming Audio",
+                        interactive=False,
+                        streaming=True,
+                        autoplay=True
+                    )
+                    with gr.Row():
+                        stream_btn = gr.Button("▶️ Start Streaming", variant="primary")
+                        stop_btn = gr.Button("⏹️ Stop", variant="stop")
+    # Footer
+    gr.Markdown("---")
+    gr.Markdown("### ✨ VoiceBloom - Bringing wisdom to life through the art of voice ✨")
+    # Set up event handlers
+    random_btn.click(fn=get_random_quote, outputs=[text_input])
+    confucius_btn.click(fn=get_confucius, outputs=[text_input])
+    fruit_btn.click(fn=get_fruit_wisdom, outputs=[text_input])
+    generate_btn.click(
+        fn=generate_audio,
+        inputs=[text_input, voice_dropdown, speed_slider],
+        outputs=[audio_output]
+    )
+    stream_event = stream_btn.click(
+        fn=stream_audio,
+        inputs=[text_input, voice_dropdown, speed_slider],
+        outputs=[stream_output]
+    )
     stop_btn.click(fn=None, cancels=stream_event)
+if __name__ == "__main__":
+    app.launch()