Spaces:

stevenhillis
/

intone_mvp

Sleeping

File size: 2,911 Bytes

d86f106
 
 
 
 
 
 
 
 
 
2ae17a3
d86f106
2ae17a3
d86f106
 
2ae17a3
 
d86f106
 
 
 
2ae17a3
 
 
 
 
d86f106
 
2ae17a3
d86f106
 
 
 
 
 
 
 
2ae17a3
d86f106
2ae17a3
 
 
d86f106
2ae17a3
 
 
d86f106
 
 
 
2ae17a3

import json
import os
import requests

import gradio as gr
import numpy as np


# Endpoint that tts_fn posts its synthesis requests to.
base_url = "https://api.sandbox.deepgram.com/nlu"
# API token sent in the Authorization header. It is read at import time, so a
# missing DG_TOKEN environment variable raises KeyError before the UI starts.
token_str = os.environ['DG_TOKEN']
def tts_fn(text, prompt_audio, pitch_steps, inference_steps, inference_temperature):
    """Synthesize ``text`` with the remote API, conditioned on ``prompt_audio``.

    Args:
        text: Sentence to synthesize; sent as a one-element JSON list.
        prompt_audio: ``(sample_rate, samples)`` tuple from the Gradio audio
            input, or ``None`` when nothing has been uploaded. Only the
            samples are sent; they are scaled by 1/32768 (i.e. treated as
            16-bit PCM).
        pitch_steps: Pitch shift; converted to ``int`` before sending.
        inference_steps: Forwarded as the ``soundstorm_steps`` query parameter.
        inference_temperature: Forwarded as the ``temperature`` query
            parameter.

    Returns:
        ``(sample_rate, audio)`` where ``audio`` is an ``int16`` NumPy array
        built from the first entry of the response's ``results`` list.

    Raises:
        gr.Error: If no prompt audio was supplied, the request fails, or the
            response does not contain the expected fields.
    """
    if prompt_audio is None:
        raise gr.Error("Please upload or select a prompt audio clip first.")

    samples = np.asarray(prompt_audio[1])
    if samples.ndim > 1:
        # Gradio hands multi-channel audio over as (samples, channels);
        # flattening that directly would interleave the channels, so mix
        # down to mono before reshaping.
        samples = samples.mean(axis=1)
    prompt = np.reshape(samples, (1, 1, -1)).astype(np.float32, order='C') / 32768.0

    try:
        payload = requests.post(
            base_url,
            files=[
                ('texts', ('texts', json.dumps([text]), 'application/json')),
                ('prompt_audio', ('prompt_audio', json.dumps(prompt.tolist()), 'application/json')),
            ],
            params={
                'synthesize': 'true',
                'pitch_steps': int(pitch_steps),
                'soundstorm_steps': inference_steps,
                'temperature': inference_temperature,
            },
            headers={'Authorization': f'Token {token_str}'},
        ).json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a body that is not valid JSON (e.g. an HTML
        # error page from a gateway).
        print(err)
        raise gr.Error("The synthesis request failed; please try again.") from err

    try:
        result = payload['results'][0]
        sample_rate = int(result['sample_rate'])
        audio = (np.array(result['audio']).transpose() * 32767).astype(np.int16)
    except (KeyError, IndexError, TypeError, ValueError) as err:
        # Keep the raw payload in the server log for debugging, but surface a
        # readable error instead of falling through to an unbound return.
        print(payload)
        raise gr.Error("The synthesis service returned an unexpected response.") from err
    return (sample_rate, audio)

# Bundled speaker clips offered as one-click prompt examples.
demo_files = [
    'demo_files/man.wav',
    'demo_files/woman.wav',
    'demo_files/man_2.wav',
    'demo_files/woman_2.wav',
    'demo_files/meditation.wav',
]

# Candidate default sentences; `cherry` is the one pre-filled in the text box.
pangram = "The beige hue on the waters of the loch impressed all, including the French queen, before she heard that symphony again, just as young Arthur wanted."
cherry = "Your request has been processed and the audio is ready for playback."

with gr.Blocks() as app:
    with gr.Tab("TTS MVP"), gr.Row():
        # Left column: text, prompt audio, and synthesis controls.
        with gr.Column():
            textbox = gr.TextArea(
                label="Text",
                placeholder="Type a sentence here",
                value=cherry,
            )
            prompt_audio = gr.Audio(
                label="Prompt Audio (first 3 seconds of selection)",
                source='upload',
            )
            examples = gr.Examples(
                label='Sample Speakers',
                examples=demo_files,
                inputs=prompt_audio,
            )
            # speed = gr.Slider(minimum=0.0, maximum=2.0, value=1.1, step=0.1, label="Speed")
            pitch_steps = gr.Slider(
                minimum=-24,
                maximum=24,
                value=0,
                step=1,
                label="Pitch Steps: 12 to an octave",
            )
            # variability = gr.Slider(minimum=0.0, maximum=1.0, value=0.7, step=0.1, label="Variability")
            inference_steps = gr.Slider(
                minimum=1,
                maximum=32,
                value=1,
                step=1,
                label="Inference Steps: quality vs latency tradeoff. Results are sometimes unstable for values >1.",
            )
            inference_temperature = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                value=0.9,
                step=0.05,
                label="Temperature: fidelity vs variability tradeoff",
            )

        # Right column: synthesized audio and the trigger button.
        with gr.Column():
            audio_output = gr.Audio(label="Output Audio", elem_id='tts-audio')
            btn = gr.Button("Generate")
            btn.click(
                tts_fn,
                inputs=[
                    textbox,
                    prompt_audio,
                    pitch_steps,
                    inference_steps,
                    inference_temperature,
                ],
                outputs=[audio_output],
            )

# Start the server with a public share link.
app.launch(share=True)