"""Gradio speech-to-text demo.

Records from the default microphone on a worker thread, transcribes the
audio with Google's free recognizer (via ``speech_recognition``), and
streams a countdown/progress status into the UI while recording.
"""

import io  # kept from original; useful for in-memory audio buffers
import threading
import time

import gradio as gr
import speech_recognition as sr
from pydub import AudioSegment
from pydub.generators import Sine
from pydub.playback import play

# Shared flag between the recording worker thread and the UI progress loop.
is_recording = False

# Audible cues for recording start/end.
# NOTE(review): the original built these from AudioSegment.from_wav(io.BytesIO(b'')),
# which raises on empty input (and had unbalanced parentheses). Generated sine
# tones give real beeps with no external asset.
start_beep = AudioSegment.silent(duration=200).append(
    Sine(880).to_audio_segment(duration=150), crossfade=100
)
end_beep = AudioSegment.silent(duration=200).append(
    Sine(440).to_audio_segment(duration=150), crossfade=100
)


def play_start_sound():
    """Best-effort playback of the start beep; never raises."""
    try:
        play(start_beep)
    except Exception:
        # Audio output may be unavailable (headless host); recording still works.
        pass


def play_end_sound():
    """Best-effort playback of the end beep; never raises."""
    try:
        play(end_beep)
    except Exception:
        pass


def start_recording(audio_time_limit):
    """Record one utterance from the microphone and transcribe it.

    Args:
        audio_time_limit: Maximum phrase length in seconds.

    Returns:
        The recognized text, or "" on timeout, unintelligible audio, or
        any other recognition error.
    """
    global is_recording
    is_recording = True

    recognizer = sr.Recognizer()
    microphone = sr.Microphone()

    play_start_sound()
    try:
        with microphone as source:
            # Brief calibration so ambient noise doesn't trigger/ruin capture.
            recognizer.adjust_for_ambient_noise(source, duration=0.5)
            audio = recognizer.listen(
                source, timeout=3, phrase_time_limit=audio_time_limit
            )
        return recognizer.recognize_google(audio)
    except sr.WaitTimeoutError:
        # No speech started within the 3 s listen timeout.
        return ""
    except sr.UnknownValueError:
        # Speech captured but not intelligible to the recognizer.
        return ""
    except Exception as e:
        print(f"Error: {str(e)}")
        return ""
    finally:
        # Always signal the UI loop and play the end cue, even on error.
        play_end_sound()
        is_recording = False


def transcribe_audio(audio_time_limit=10):
    """Generator driving the UI: yields (status, text) updates while recording.

    Runs start_recording() on a worker thread so this generator can keep
    yielding countdown updates to Gradio roughly every 100 ms.

    Args:
        audio_time_limit: Recording time limit in seconds (default 10).

    Yields:
        Pairs of gr.update() dicts for (status_display, text_input).
    """
    result = ""

    def execute_recording():
        nonlocal result
        result = start_recording(audio_time_limit)

    recording_thread = threading.Thread(target=execute_recording)
    recording_thread.start()

    start_time = time.time()
    while is_recording and (time.time() - start_time) < audio_time_limit:
        time_left = max(0.0, audio_time_limit - (time.time() - start_time))
        yield (
            gr.update(value=f"🎤 Recording... {time_left:.1f}s left", visible=True),
            gr.update(value="", visible=True),
        )
        # FIX: original called gr.sleep(), which does not exist in Gradio.
        time.sleep(0.1)

    recording_thread.join()
    yield (
        gr.update(value="✅ Done!", visible=True),
        gr.update(value=result, visible=True),
    )


def create_ui():
    """Build and return the Gradio Blocks app."""
    css = """
    .mic-button {
        background: linear-gradient(45deg, #FF3366, #BA265D) !important;
        border: none !important;
        color: white !important;
        padding: 12px !important;
        border-radius: 50% !important;
        height: 50px !important;
        width: 50px !important;
        margin-left: 10px !important;
    }
    .mic-button:hover {
        transform: scale(1.05) !important;
    }
    .input-with-mic {
        display: flex !important;
        align-items: center !important;
        gap: 10px !important;
    }
    .status-message {
        font-style: italic;
        color: #666;
        margin-top: 5px;
    }
    """
    with gr.Blocks(css=css) as demo:
        gr.Markdown("## 🎤 Speech to Text Converter")

        with gr.Group():
            with gr.Row():
                text_input = gr.Textbox(
                    label="Your Input",
                    placeholder="Click the mic button and speak...",
                    elem_classes=["input-box"],
                    scale=9,
                )
                mic_button = gr.Button(
                    "🎤",
                    elem_classes=["mic-button"],
                    scale=1,
                )

            # Declared in the layout (the original instantiated it inline in
            # the click() inputs, which renders it at an arbitrary point).
            time_limit = gr.Slider(
                5, 30, value=10, label="Recording time limit (seconds)"
            )

            status_display = gr.Textbox(
                label="Status",
                visible=False,
                interactive=False,
                elem_classes=["status-message"],
            )

        mic_button.click(
            fn=transcribe_audio,
            inputs=[time_limit],
            outputs=[status_display, text_input],
            show_progress="hidden",
        )

        gr.Examples(
            examples=[
                "Hello world",
                "How are you today?",
                "Please convert my speech to text",
            ],
            inputs=text_input,
            label="Try these examples:",
        )

    return demo


if __name__ == "__main__":
    demo = create_ui()
    demo.launch(debug=True)