Futuresony committed on
Commit 18433e4 · verified · 1 Parent(s): 555da4a

Update app.py

Files changed (1)
  1. app.py +64 -239
app.py CHANGED
@@ -1,240 +1,65 @@
-
  import gradio as gr
- from huggingface_hub import snapshot_download
- from threading import Thread
- import time
- import base64
- import numpy as np
- import requests
- import traceback
- from dataclasses import dataclass, field
- import io
- from pydub import AudioSegment
- import librosa
- from utils.vad import get_speech_timestamps, collect_chunks, VadOptions
- import tempfile
-
-
- from server import serve
-
- repo_id = "Futuresony/future_ai_12_10_2024.gguf"
- snapshot_download(repo_id, local_dir="./checkpoint", revision="main")
-
- IP = "0.0.0.0"
- PORT = 60808
-
- thread = Thread(target=serve, daemon=True)
- thread.start()
-
- API_URL = "http://0.0.0.0:60808/chat"
-
- # recording parameters
- IN_CHANNELS = 1
- IN_RATE = 24000
- IN_CHUNK = 1024
- IN_SAMPLE_WIDTH = 2
- VAD_STRIDE = 0.5
-
- # playing parameters
- OUT_CHANNELS = 1
- OUT_RATE = 24000
- OUT_SAMPLE_WIDTH = 2
- OUT_CHUNK = 5760
-
-
- OUT_CHUNK = 20 * 4096
- OUT_RATE = 24000
- OUT_CHANNELS = 1
-
-
- def run_vad(ori_audio, sr):
-     _st = time.time()
-     try:
-         audio = ori_audio
-         audio = audio.astype(np.float32) / 32768.0
-         sampling_rate = 16000
-         if sr != sampling_rate:
-             audio = librosa.resample(audio, orig_sr=sr, target_sr=sampling_rate)
-
-         vad_parameters = {}
-         vad_parameters = VadOptions(**vad_parameters)
-         speech_chunks = get_speech_timestamps(audio, vad_parameters)
-         audio = collect_chunks(audio, speech_chunks)
-         duration_after_vad = audio.shape[0] / sampling_rate
-
-         if sr != sampling_rate:
-             # resample to original sampling rate
-             vad_audio = librosa.resample(audio, orig_sr=sampling_rate, target_sr=sr)
-         else:
-             vad_audio = audio
-         vad_audio = np.round(vad_audio * 32768.0).astype(np.int16)
-         vad_audio_bytes = vad_audio.tobytes()
-
-         return duration_after_vad, vad_audio_bytes, round(time.time() - _st, 4)
-     except Exception as e:
-         msg = f"[asr vad error] audio_len: {len(ori_audio)/(sr*2):.3f} s, trace: {traceback.format_exc()}"
-         print(msg)
-         return -1, ori_audio, round(time.time() - _st, 4)
-
-
- def warm_up():
-     frames = b"\x00\x00" * 1024 * 2  # 1024 frames of 2 bytes each
-     dur, frames, tcost = run_vad(frames, 16000)
-     print(f"warm up done, time_cost: {tcost:.3f} s")
-
-
- warm_up()
-
-
- @dataclass
- class AppState:
-     stream: np.ndarray | None = None
-     sampling_rate: int = 0
-     pause_detected: bool = False
-     started_talking: bool = False
-     stopped: bool = False
-     conversation: list = field(default_factory=list)
-
-
- def determine_pause(audio: np.ndarray, sampling_rate: int, state: AppState) -> bool:
-     """Take in the stream, determine if a pause happened"""
-
-     temp_audio = audio
-
-     dur_vad, _, time_vad = run_vad(temp_audio, sampling_rate)
-     duration = len(audio) / sampling_rate
-
-     if dur_vad > 0.5 and not state.started_talking:
-         print("started talking")
-         state.started_talking = True
-         return False
-
-     print(f"duration_after_vad: {dur_vad:.3f} s, time_vad: {time_vad:.3f} s")
-
-     return (duration - dur_vad) > 1
-
-
- def speaking(audio_bytes: str):
-
-     base64_encoded = str(base64.b64encode(audio_bytes), encoding="utf-8")
-     files = {"audio": base64_encoded}
-     with requests.post(API_URL, json=files, stream=True) as response:
-         try:
-             for chunk in response.iter_content(chunk_size=OUT_CHUNK):
-                 if chunk:
-                     # Create an audio segment from the numpy array
-                     audio_segment = AudioSegment(
-                         chunk,
-                         frame_rate=OUT_RATE,
-                         sample_width=OUT_SAMPLE_WIDTH,
-                         channels=OUT_CHANNELS,
-                     )
-
-                     # Export the audio segment to MP3 bytes - use a high bitrate to maximise quality
-                     mp3_io = io.BytesIO()
-                     audio_segment.export(mp3_io, format="mp3", bitrate="320k")
-
-                     # Get the MP3 bytes
-                     mp3_bytes = mp3_io.getvalue()
-                     mp3_io.close()
-                     yield mp3_bytes
-
-         except Exception as e:
-             raise gr.Error(f"Error during audio streaming: {e}")
-
-
-
-
- def process_audio(audio: tuple, state: AppState):
-     if state.stream is None:
-         state.stream = audio[1]
-         state.sampling_rate = audio[0]
-     else:
-         state.stream = np.concatenate((state.stream, audio[1]))
-
-     pause_detected = determine_pause(state.stream, state.sampling_rate, state)
-     state.pause_detected = pause_detected
-
-     if state.pause_detected and state.started_talking:
-         return gr.Audio(recording=False), state
-     return None, state
-
-
- def response(state: AppState):
-     if not state.pause_detected and not state.started_talking:
-         return None, AppState()
-
-     audio_buffer = io.BytesIO()
-
-     segment = AudioSegment(
-         state.stream.tobytes(),
-         frame_rate=state.sampling_rate,
-         sample_width=state.stream.dtype.itemsize,
-         channels=(1 if len(state.stream.shape) == 1 else state.stream.shape[1]),
-     )
-     segment.export(audio_buffer, format="wav")
-
-     with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
-         f.write(audio_buffer.getvalue())
-
-     state.conversation.append({"role": "user",
-                                "content": {"path": f.name,
-                                            "mime_type": "audio/wav"}})
-
-     output_buffer = b""
-
-     for mp3_bytes in speaking(audio_buffer.getvalue()):
-         output_buffer += mp3_bytes
-         yield mp3_bytes, state
-
-     with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
-         f.write(output_buffer)
-
-     state.conversation.append({"role": "assistant",
-                                "content": {"path": f.name,
-                                            "mime_type": "audio/mp3"}})
-     yield None, AppState(conversation=state.conversation)
-
-
-
-
- def start_recording_user(state: AppState):
-     if not state.stopped:
-         return gr.Audio(recording=True)
-
- with gr.Blocks() as demo:
-     with gr.Row():
-         with gr.Column():
-             input_audio = gr.Audio(
-                 label="Input Audio", sources="microphone", type="numpy"
-             )
-         with gr.Column():
-             chatbot = gr.Chatbot(label="Conversation", type="messages")
-             output_audio = gr.Audio(label="Output Audio", streaming=True, autoplay=True)
-     state = gr.State(value=AppState())
-
-     stream = input_audio.stream(
-         process_audio,
-         [input_audio, state],
-         [input_audio, state],
-         stream_every=0.5,
-         time_limit=30,
-     )
-     respond = input_audio.stop_recording(
-         response,
-         [state],
-         [output_audio, state]
-     )
-     respond.then(lambda s: s.conversation, [state], [chatbot])
-
-     restart = output_audio.stop(
-         start_recording_user,
-         [state],
-         [input_audio]
-     )
-     cancel = gr.Button("Stop Conversation", variant="stop")
-     cancel.click(lambda: (AppState(stopped=True), gr.Audio(recording=False)), None,
-                  [state, input_audio], cancels=[respond, restart])
-
-
- demo.launch()
+ from huggingface_hub import InferenceClient
+
+ """
+ For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
+ """
+ client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")
+
+
+ def respond(
+     message,
+     history: list[tuple[str, str]],
+     system_message,
+     max_tokens,
+     temperature,
+     top_p,
+ ):
+     messages = [{"role": "system", "content": system_message}]
+
+     for val in history:
+         if val[0]:
+             messages.append({"role": "user", "content": val[0]})
+         if val[1]:
+             messages.append({"role": "assistant", "content": val[1]})
+
+     messages.append({"role": "user", "content": message})
+
+     response = ""
+
+     for message in client.chat_completion(
+         messages,
+         max_tokens=max_tokens,
+         stream=True,
+         temperature=temperature,
+         top_p=top_p,
+     ):
+         token = message.choices[0].delta.content
+
+         response += token
+         yield response
+
+
+ """
+ For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
+ """
+ demo = gr.ChatInterface(
+     respond,
+     additional_inputs=[
+         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
+         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
+         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
+         gr.Slider(
+             minimum=0.1,
+             maximum=1.0,
+             value=0.95,
+             step=0.05,
+             label="Top-p (nucleus sampling)",
+         ),
+     ],
+ )
+
+
+ if __name__ == "__main__":
+     demo.launch()
+
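
Two small caveats in the new `respond` generator, for anyone adapting it: the loop variable in `for message in client.chat_completion(...)` shadows the `message` parameter, and in chat-completion streams the final chunk's `delta.content` can be `None`, which would make `response += token` raise a `TypeError`. A minimal hardened sketch of the same callback (the `respond_safe` name, the `chunk` variable, and the `None` guard are illustrative additions, not part of this commit):

```python
from huggingface_hub import InferenceClient

client = InferenceClient("Futuresony/future_ai_12_10_2024.gguf")


def respond_safe(message, history, system_message, max_tokens, temperature, top_p):
    # Build the chat transcript exactly as the committed respond() does.
    messages = [{"role": "system", "content": system_message}]
    for user_turn, bot_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": user_turn})
        if bot_turn:
            messages.append({"role": "assistant", "content": bot_turn})
    messages.append({"role": "user", "content": message})

    response = ""
    # `chunk` avoids shadowing the `message` parameter.
    for chunk in client.chat_completion(
        messages,
        max_tokens=max_tokens,
        stream=True,
        temperature=temperature,
        top_p=top_p,
    ):
        token = chunk.choices[0].delta.content
        if token:  # the final stream chunk may carry delta.content=None
            response += token
            yield response
```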
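Because the ChatInterface callback is a plain generator, the model wiring can be smoke-tested without launching the Gradio UI, assuming the repo is actually reachable through the serverless Inference API (worth verifying for a raw GGUF repo). A hypothetical check using the `respond_safe` sketch above; the prompt, history, and sampling values are illustrative:

```python
# Hypothetical smoke test; requires the Inference API to serve the model.
history = [("Hello", "Hi! How can I help?")]
final = ""
for partial in respond_safe(
    "Tell me about Gradio.",
    history,
    system_message="You are a friendly Chatbot.",
    max_tokens=64,
    temperature=0.7,
    top_p=0.95,
):
    final = partial  # each yield is the accumulated reply so far
print(final)
```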