File size: 2,221 Bytes
0455eb3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import io
import queue

import soundfile as sf
import streamlit as st
import torch
import transformers
from pynput import keyboard
from transformers import AutoTokenizer, VitsModel
from transformers import WhisperFeatureExtractor, WhisperForConditionalGeneration

# Voice code -> Hugging Face text-to-speech checkpoint.
# Whisper (used here previously) is a speech-to-text model: it takes audio and
# produces text, so it cannot synthesize speech. The MMS-TTS VITS checkpoints
# are text-to-speech models served by the already-required transformers package.
TTS_CHECKPOINTS = {
    "en": "facebook/mms-tts-eng",
    "fr": "facebook/mms-tts-fra",
    "es": "facebook/mms-tts-spa",
}

# Voice, speed, and pitch settings (initial values, overwritten by the UI below)
voice = "en"  # English (change for other voices)
speed = 1.0
pitch = 1.0

# Key labels captured on the pynput listener thread, waiting to be announced
# on the Streamlit script thread.
_pending_keys = queue.Queue()


def load_tts(voice_code):
    """Load the tokenizer and VITS model for a voice code.

    Args:
        voice_code: A key of ``TTS_CHECKPOINTS`` (for example ``"en"``).

    Returns:
        A ``(tokenizer, model)`` tuple with the model switched to eval mode.

    Raises:
        KeyError: If ``voice_code`` has no configured checkpoint.
    """
    # NOTE(review): wrapping this in st.cache_resource would avoid reloading
    # the checkpoint on every button press; left uncached to stay
    # independent of the installed Streamlit version.
    checkpoint = TTS_CHECKPOINTS[voice_code]
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    tts_model = VitsModel.from_pretrained(checkpoint)
    tts_model.eval()
    return tokenizer, tts_model


def key_label(key):
    """Return a speakable label for a pynput key event.

    Printable keys yield their character, special keys yield their name with
    underscores replaced by spaces (``shift_r`` -> ``shift r``), and anything
    else falls back to ``str(key)``.
    """
    char = getattr(key, "char", None)
    if char:
        return char
    name = getattr(key, "name", None)
    if name:
        return name.replace("_", " ")
    return str(key)


def synthesize(text, tokenizer, tts_model, speaking_rate=1.0, pitch_factor=1.0):
    """Synthesize ``text`` and return it as WAV-encoded bytes.

    Args:
        text: The text to speak.
        tokenizer: Tokenizer returned by ``load_tts``.
        tts_model: VITS model returned by ``load_tts``.
        speaking_rate: Tempo multiplier; values above 1.0 speak faster.
        pitch_factor: Pitch multiplier; values above 1.0 sound higher.

    Returns:
        The audio as ``bytes`` in WAV format.
    """
    inputs = tokenizer(text, return_tensors="pt")

    # VITS has no pitch control. Pitch is approximated by declaring a scaled
    # sample rate in the WAV header, which also scales playback tempo by the
    # same factor, so the model's speaking rate is divided by it to compensate.
    tts_model.speaking_rate = speaking_rate / pitch_factor
    with torch.no_grad():
        waveform = tts_model(**inputs).waveform

    samples = waveform[0].cpu().numpy()
    playback_rate = int(round(tts_model.config.sampling_rate * pitch_factor))

    # Encode in memory so successive key presses do not overwrite a shared file.
    buffer = io.BytesIO()
    sf.write(buffer, samples, samplerate=playback_rate, format="WAV")
    return buffer.getvalue()


def on_press(key):
    """pynput callback: queue the pressed key for announcement.

    This runs on pynput's listener thread, where Streamlit commands have no
    script context, so it only records the key; synthesis and ``st.audio``
    happen on the script thread.

    Returns:
        ``False`` when Esc is pressed, which stops the listener; ``None``
        otherwise.
    """
    # Handle special keys (consider modifications for your needs)
    if key == keyboard.Key.esc:
        return False  # Exit keystroke listener

    _pending_keys.put(key_label(key))
    return None


# Streamlit App

st.title("Text-to-Speech Keystroke Announcer")

# User Interface for customization options
voice = st.selectbox("Voice", list(TTS_CHECKPOINTS))  # Add more options
speed = st.slider("Speaking Speed", min_value=0.5, max_value=2.0, value=1.0)
pitch = st.slider("Speaking Pitch", min_value=0.5, max_value=2.0, value=1.0)

# Shown before the listener starts, because the loop below blocks the script.
st.write("Press 'Esc' to stop keystroke detection.")

# Start keystroke listener on button press
# NOTE: pynput captures keys on the machine running this script, not in the
# viewer's browser.
if st.button("Start Keystroke Announcer"):
    tokenizer, model = load_tts(voice)

    with keyboard.Listener(on_press=on_press) as listener:
        # Keep draining until the listener has stopped and the queue is empty.
        while listener.is_alive() or not _pending_keys.empty():
            try:
                label = _pending_keys.get(timeout=0.1)
            except queue.Empty:
                continue

            try:
                wav_bytes = synthesize(
                    label,
                    tokenizer,
                    model,
                    speaking_rate=speed,
                    pitch_factor=pitch,
                )
            except (RuntimeError, ValueError) as err:
                # One unpronounceable key should not end the whole session.
                st.warning(f"Could not announce {label!r}: {err}")
                continue

            st.audio(wav_bytes, format="audio/wav")