Anna Sun, Justin Haaheim, mduppes, and radames committed
Commit 2485dd8 · 0 Parent(s):

squash for release


Co-authored-by: Justin Haaheim <justinhaaheim@users.noreply.github.com>
Co-authored-by: Mark Duppenthaler <mduppes@gmail.com>
Co-authored-by: Radamés Ajna <radamajna@gmail.com>

This view is limited to 50 files because the commit contains too many changes.
Files changed (50):
  1. .dockerignore +1 -0
  2. .gitattributes +37 -0
  3. .gitignore +5 -0
  4. Dockerfile +77 -0
  5. README.md +61 -0
  6. seamless_server/.gitignore +4 -0
  7. seamless_server/app_pubsub.py +797 -0
  8. seamless_server/debug/.gitignore +4 -0
  9. seamless_server/models/SeamlessStreaming/vad_s2st_sc_24khz_main.yaml +21 -0
  10. seamless_server/requirements.txt +30 -0
  11. seamless_server/src/room.py +64 -0
  12. seamless_server/src/simuleval_agent_directory.py +150 -0
  13. seamless_server/src/simuleval_transcoder.py +423 -0
  14. seamless_server/src/speech_and_text_output.py +15 -0
  15. seamless_server/src/transcoder_helpers.py +43 -0
  16. seamless_server/whl/seamless_communication-1.0.0-py3-none-any.whl +3 -0
  17. streaming-react-app/.eslintrc.cjs +18 -0
  18. streaming-react-app/.gitignore +24 -0
  19. streaming-react-app/README.md +14 -0
  20. streaming-react-app/index.html +13 -0
  21. streaming-react-app/package-lock.json +0 -0
  22. streaming-react-app/package.json +54 -0
  23. streaming-react-app/src/App.tsx +57 -0
  24. streaming-react-app/src/Blink.tsx +41 -0
  25. streaming-react-app/src/DebugSection.tsx +62 -0
  26. streaming-react-app/src/RoomConfig.tsx +262 -0
  27. streaming-react-app/src/SocketWrapper.tsx +218 -0
  28. streaming-react-app/src/StreamingInterface.css +56 -0
  29. streaming-react-app/src/StreamingInterface.tsx +1165 -0
  30. streaming-react-app/src/URLParams.ts +50 -0
  31. streaming-react-app/src/assets/Roboto-msdf.json +0 -0
  32. streaming-react-app/src/assets/Roboto-msdf.png +0 -0
  33. streaming-react-app/src/assets/RobotoMono-Regular-msdf.json +0 -0
  34. streaming-react-app/src/assets/RobotoMono-Regular.png +0 -0
  35. streaming-react-app/src/assets/seamless.svg +6 -0
  36. streaming-react-app/src/createBufferedSpeechPlayer.ts +173 -0
  37. streaming-react-app/src/cursorBlinkInterval.ts +1 -0
  38. streaming-react-app/src/debug.ts +257 -0
  39. streaming-react-app/src/float32To16BitPCM.ts +16 -0
  40. streaming-react-app/src/generateNewRoomID.ts +56 -0
  41. streaming-react-app/src/getParamFlag.ts +39 -0
  42. streaming-react-app/src/getTranslationSentencesFromReceivedData.ts +22 -0
  43. streaming-react-app/src/isScrolledToDocumentBottom.ts +11 -0
  44. streaming-react-app/src/languageLookup.ts +117 -0
  45. streaming-react-app/src/main.tsx +9 -0
  46. streaming-react-app/src/react-xr/ARButton.tsx +89 -0
  47. streaming-react-app/src/react-xr/Button.tsx +117 -0
  48. streaming-react-app/src/react-xr/Colors.ts +6 -0
  49. streaming-react-app/src/react-xr/MovementController.tsx +64 -0
  50. streaming-react-app/src/react-xr/Playground.tsx +133 -0
.dockerignore ADDED
@@ -0,0 +1 @@
+ seamless_server/models/*.pt
.gitattributes ADDED
@@ -0,0 +1,37 @@
+ *.7z filter=lfs diff=lfs merge=lfs -text
+ *.arrow filter=lfs diff=lfs merge=lfs -text
+ *.bin filter=lfs diff=lfs merge=lfs -text
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
+ *.ftz filter=lfs diff=lfs merge=lfs -text
+ *.gz filter=lfs diff=lfs merge=lfs -text
+ *.h5 filter=lfs diff=lfs merge=lfs -text
+ *.joblib filter=lfs diff=lfs merge=lfs -text
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
+ *.model filter=lfs diff=lfs merge=lfs -text
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
+ *.npy filter=lfs diff=lfs merge=lfs -text
+ *.npz filter=lfs diff=lfs merge=lfs -text
+ *.onnx filter=lfs diff=lfs merge=lfs -text
+ *.ot filter=lfs diff=lfs merge=lfs -text
+ *.parquet filter=lfs diff=lfs merge=lfs -text
+ *.pb filter=lfs diff=lfs merge=lfs -text
+ *.pickle filter=lfs diff=lfs merge=lfs -text
+ *.pkl filter=lfs diff=lfs merge=lfs -text
+ *.pt filter=lfs diff=lfs merge=lfs -text
+ *.pth filter=lfs diff=lfs merge=lfs -text
+ *.rar filter=lfs diff=lfs merge=lfs -text
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
+ *.tar filter=lfs diff=lfs merge=lfs -text
+ *.tflite filter=lfs diff=lfs merge=lfs -text
+ *.tgz filter=lfs diff=lfs merge=lfs -text
+ *.wasm filter=lfs diff=lfs merge=lfs -text
+ *.xz filter=lfs diff=lfs merge=lfs -text
+ *.zip filter=lfs diff=lfs merge=lfs -text
+ *.zst filter=lfs diff=lfs merge=lfs -text
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
+ *.mp3 filter=lfs diff=lfs merge=lfs -text
+ *.whl filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,5 @@
+ .vscode/settings.json
+ __pycache__/
+ *.pt
+ *.model
+ venv/
Dockerfile ADDED
@@ -0,0 +1,77 @@
+ # build frontend with node
+ FROM node:20-alpine AS frontend
+ RUN apk add --no-cache libc6-compat
+ WORKDIR /app
+
+ COPY streaming-react-app .
+ RUN \
+     if [ -f yarn.lock ]; then yarn --frozen-lockfile; \
+     elif [ -f package-lock.json ]; then npm ci; \
+     elif [ -f pnpm-lock.yaml ]; then yarn global add pnpm && pnpm i --frozen-lockfile; \
+     else echo "Lockfile not found." && exit 1; \
+     fi
+
+ RUN npm run build
+
+ # build backend on CUDA
+ FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 AS backend
+ WORKDIR /app
+
+ ENV DEBIAN_FRONTEND=noninteractive
+ ENV NODE_MAJOR=20
+
+ RUN apt-get update && \
+     apt-get upgrade -y && \
+     apt-get install -y --no-install-recommends \
+     git \
+     git-lfs \
+     wget \
+     curl \
+     # python build dependencies \
+     build-essential \
+     libssl-dev \
+     zlib1g-dev \
+     libbz2-dev \
+     libreadline-dev \
+     libsqlite3-dev \
+     libncursesw5-dev \
+     xz-utils \
+     tk-dev \
+     libxml2-dev \
+     libxmlsec1-dev \
+     libffi-dev \
+     liblzma-dev \
+     sox libsox-fmt-all \
+     # gradio dependencies \
+     ffmpeg \
+     # fairseq2 dependencies \
+     libsndfile-dev && \
+     apt-get clean && \
+     rm -rf /var/lib/apt/lists/*
+
+ RUN useradd -m -u 1000 user
+ USER user
+ ENV HOME=/home/user \
+     PATH=/home/user/.local/bin:$PATH
+ WORKDIR $HOME/app
+
+ RUN curl https://pyenv.run | bash
+ ENV PATH=$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH
+ ARG PYTHON_VERSION=3.10.12
+ RUN pyenv install $PYTHON_VERSION && \
+     pyenv global $PYTHON_VERSION && \
+     pyenv rehash && \
+     pip install --no-cache-dir -U pip setuptools wheel
+
+ COPY --chown=user:user ./seamless_server ./seamless_server
+ # change dir since pip needs to seed whl folder
+ RUN cd seamless_server && pip install --no-cache-dir --upgrade -r requirements.txt
+ COPY --from=frontend /app/dist ./streaming-react-app/dist
+
+ WORKDIR $HOME/app/seamless_server
+ USER root
+ RUN ln -s /usr/lib/x86_64-linux-gnu/libsox.so.3 /usr/lib/x86_64-linux-gnu/libsox.so
+ USER user
+ CMD [ "uvicorn", "app_pubsub:app", "--host", "0.0.0.0", "--port", "7860" ]
+
+
README.md ADDED
@@ -0,0 +1,61 @@
+ ---
+ title: Seamless Streaming Backend/Frontend
+ emoji: 📞
+ colorFrom: blue
+ colorTo: yellow
+ sdk: docker
+ pinned: false
+ suggested_hardware: t4-medium
+ ---
+
+ # Seamless Streaming demo
+ ## Running on HF spaces
+ You can simply duplicate the space to run it.
+
+ ## Running locally
+ ### Install backend seamless_server dependencies
+
+ Please note: we *strongly* recommend running the model on GPU.
+
+ If running for the first time, create a conda environment and install the desired torch version. The example below is for PyTorch 2.1.1 and variant cu118. Check [here](https://pytorch.org/get-started/locally/) to find the command for your variant. Then install the rest of the requirements:
+ ```
+ cd seamless_server
+ conda create --name smlss_server python=3.8
+ conda activate smlss_server
+ conda install pytorch torchvision torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia
+ pip install -r requirements.txt
+ conda install -c conda-forge libsndfile==1.0.31
+ ```
+
+ ### Install frontend streaming-react-app dependencies
+ ```
+ conda install -c conda-forge nodejs
+ cd streaming-react-app
+ npm install
+ npm run build # this will create the dist/ folder
+ ```
+
+
+ ### Running the server
+
+ The server can be run locally with uvicorn, as shown below.
+ Run the server in dev mode:
+
+ ```
+ cd seamless_server
+ uvicorn app_pubsub:app --reload --host localhost
+ ```
+
+ Run the server in prod mode:
+
+ ```
+ cd seamless_server
+ uvicorn app_pubsub:app --host 0.0.0.0
+ ```
+
+ To enable additional logging from uvicorn, pass `--log-level debug` or `--log-level trace`.
+
+
+ ### Debugging
+
+ If you enable the "Server Debug Flag" when starting streaming from the client, the server produces extensive debug logging and saves audio files in the /debug folder.
seamless_server/.gitignore ADDED
@@ -0,0 +1,4 @@
+ __pycache__/
+ src/__pycache__/
+ debug/
+ .vscode/
seamless_server/app_pubsub.py ADDED
@@ -0,0 +1,797 @@
+ from operator import itemgetter
+ import os
+ from typing import Any, Optional, Tuple, Dict, TypedDict
+ from urllib import parse
+ from uuid import uuid4
+ import colorlog
+ import io
+ import logging
+ from pprint import pformat
+ import socketio
+ import sys
+ import time
+ import random
+ import string
+ from starlette.applications import Starlette
+ from starlette.routing import Mount, Route
+ from starlette.staticfiles import StaticFiles
+
+
+ from src.room import Room, Member
+ from src.simuleval_agent_directory import NoAvailableAgentException
+ from src.simuleval_agent_directory import SimulevalAgentDirectory
+ from src.simuleval_transcoder import SimulevalTranscoder
+ from src.transcoder_helpers import get_transcoder_output_events
+
+ ###############################################
+ # Constants
+ ###############################################
+
+ DEBUG = True
+
+ ALL_ROOM_ID = "ALL"
+
+ ROOM_ID_USABLE_CHARACTERS = string.ascii_uppercase
+ ROOM_ID_LENGTH = 4
+
+ ROOM_LISTENERS_SUFFIX = "_listeners"
+ ROOM_SPEAKERS_SUFFIX = "_speakers"
+
+ ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME = "remove_server_lock"
+
+ ###############################################
+ # Configure logger
+ ###############################################
+
+ logger = logging.getLogger("socketio_server_pubsub")
+ logger.propagate = False
+
+ handler = colorlog.StreamHandler(stream=sys.stdout)
+
+ formatter = colorlog.ColoredFormatter(
+     "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
+     reset=True,
+     log_colors={
+         "DEBUG": "cyan",
+         "INFO": "green",
+         "WARNING": "yellow",
+         "ERROR": "red",
+         "CRITICAL": "red,bg_white",
+     },
+ )
+
+ handler.setFormatter(formatter)
+ logger.addHandler(handler)
+
+ logger.setLevel(logging.WARNING)
+
+ print("")
+ print("")
+ print("=" * 20 + " ⭐️ Starting Server... ⭐️ " + "=" * 20)
+
+ ###############################################
+ # Configure socketio server
+ ###############################################
+
+ CLIENT_BUILD_PATH = "../streaming-react-app/dist/"
+ static_files = {
+     "/": CLIENT_BUILD_PATH,
+     "/assets/seamless-db6a2555.svg": {
+         "filename": CLIENT_BUILD_PATH + "assets/seamless-db6a2555.svg",
+         "content_type": "image/svg+xml",
+     },
+ }
+
+ # sio is the main socket.io entrypoint
+ sio = socketio.AsyncServer(
+     async_mode="asgi",
+     cors_allowed_origins="*",
+     logger=logger,
+     # engineio_logger=logger,
+ )
+ # sio.logger.setLevel(logging.DEBUG)
+ socketio_app = socketio.ASGIApp(sio)
+
+ app_routes = [
+     Mount("/ws", app=socketio_app),  # Mount Socket.IO server under /ws
+     Mount(
+         "/", app=StaticFiles(directory=CLIENT_BUILD_PATH, html=True)
+     ),  # Serve static files from root
+ ]
+ app = Starlette(debug=True, routes=app_routes)
+
+ # rooms is indexed by room_id
+ rooms: Dict[str, Room] = {}
+
+
+ class MemberDirectoryObject(TypedDict):
+     room: Room
+     member_object: Member
+
+
+ # member_directory is indexed by client_id
+ # NOTE: client_id is really "client session id", meaning that it is unique to a single browser session.
+ # If a user opens a new tab, they will have a different client_id and can join another room, join
+ # the same room with different roles, etc.
+ # NOTE: For a long-running production server we would want to clean up members after a certain timeout
+ # but for this limited application we can just keep them around
+ member_directory: Dict[str, MemberDirectoryObject] = {}
+
+
+ class ServerLock(TypedDict):
+     name: str
+     client_id: str
+     member_object: Member
+
+
+ server_lock: Optional[ServerLock] = None
+
+ server_id = str(uuid4())
+
+ # Specify specific models to load (some environments have issues loading multiple models)
+ # See AgentWithInfo for JSON format details.
+ models_override = os.environ.get("MODELS_OVERRIDE")
+
+ available_agents = SimulevalAgentDirectory()
+ logger.info("Building and adding agents...")
+ if models_override is not None:
+     logger.info(f"MODELS_OVERRIDE supplied from env vars: {models_override}")
+ available_agents.build_and_add_agents(models_override)
+
+ agents_capabilities_for_json = available_agents.get_agents_capabilities_list_for_json()
+
+
+ ###############################################
+ # Helpers
+ ###############################################
+
+
+ def catch_and_log_exceptions_for_sio_event_handlers(func):
+     # wrapper should have the same signature as the original function
+     async def catch_exception_wrapper(*args, **kwargs):
+         try:
+             return await func(*args, **kwargs)
+         except Exception as e:
+             message = f"[app_pubsub] Caught exception in '{func.__name__}' event handler:\n\n{e}"
+             logger.exception(message, stack_info=True)
+
+             try:
+                 exception_data = {
+                     "message": message,
+                     "timeEpochMs": int(time.time() * 1000),
+                 }
+
+                 try:
+                     # Let's try to add as much useful metadata as possible to the server_exception event
+                     sid = args[0]
+                     if isinstance(sid, str) and len(sid) > 0:
+                         session_data = await get_session_data(sid)
+                         if session_data:
+                             client_id = session_data.get("client_id")
+                             member = session_data.get("member_object")
+                             room = session_data.get("room_object")
+
+                             exception_data["room"] = str(room)
+                             exception_data["member"] = str(member)
+                             exception_data["clientID"] = str(client_id)
+                 except Exception as inner_e:
+                     # We expect there will be times when clientID or other values aren't present, so just log this as a warning
+                     logger.warn(
+                         f"[app_pubsub] Caught exception while trying to add additional_data to server_exception:\n\n{inner_e}"
+                     )
+
+                 # For now let's emit this to all clients. We ultimately may want to emit it just to the room it's happening in.
+                 await sio.emit("server_exception", exception_data)
+             except Exception as inner_e:
+                 logger.exception(
+                     f"[app_pubsub] Caught exception while trying to emit server_exception event:\n{inner_e}"
+                 )
+
+             # Re-raise the exception so it's handled normally by the server
+             raise e
+
+     # Set the name of the wrapper to the name of the original function so that the socketio server can associate it with the right event
+     catch_exception_wrapper.__name__ = func.__name__
+     return catch_exception_wrapper
+
+
+ async def emit_room_state_update(room):
+     await sio.emit(
+         "room_state_update",
+         room.to_json(),
+         room=room.room_id,
+     )
+
+
+ async def emit_server_state_update():
+     room_statuses = {
+         room_id: room.get_room_status_dict() for room_id, room in rooms.items()
+     }
+     total_active_connections = sum(
+         [room_status["activeConnections"] for room_status in room_statuses.values()]
+     )
+     total_active_transcoders = sum(
+         [room_status["activeTranscoders"] for room_status in room_statuses.values()]
+     )
+     logger.info(
+         f"[Server Status]: {total_active_connections} active connections (in rooms); {total_active_transcoders} active transcoders"
+     )
+     logger.info(f"[Server Status]: server_lock={server_lock}")
+     server_lock_object_for_js = (
+         {
+             "name": server_lock.get("name"),
+             "clientID": server_lock.get("client_id"),
+             "isActive": server_lock.get("member_object")
+             and server_lock.get("member_object").transcoder is not None,
+         }
+         if server_lock
+         else None
+     )
+     await sio.emit(
+         "server_state_update",
+         {
+             "statusByRoom": room_statuses,
+             "totalActiveConnections": total_active_connections,
+             "totalActiveTranscoders": total_active_transcoders,
+             "agentsCapabilities": agents_capabilities_for_json,
+             "serverLock": server_lock_object_for_js,
+         },
+         room=ALL_ROOM_ID,
+     )
+
+
+ async def get_session_data(sid):
+     session = await sio.get_session(sid)
+     # It seems like if the session has not been set that get_session may return None, so let's provide a fallback empty dictionary here
+     return session or {}
+
+
+ async def set_session_data(sid, client_id, room_id, room_object, member_object):
+     await sio.save_session(
+         sid,
+         {
+             "client_id": client_id,
+             "room_id": room_id,
+             "room_object": room_object,
+             "member_object": member_object,
+         },
+     )
+
+
+ def get_random_room_id():
+     return "".join(random.choices(ROOM_ID_USABLE_CHARACTERS, k=ROOM_ID_LENGTH))
+
+
+ def get_random_unused_room_id():
+     room_id = get_random_room_id()
+     while room_id in rooms:
+         room_id = get_random_room_id()
+     return room_id
+
+
+ ###############################################
+ # Socket.io Basic Event Handlers
+ ###############################################
+
+
+ @sio.on("connect")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def connect(sid, environ):
+     logger.info(f"📥 [event: connected] sid={sid}")
+
+     # TODO: Sanitize/validate query param input
+     query_params = dict(parse.parse_qsl(environ["QUERY_STRING"]))
+     client_id = query_params.get("clientID")
+
+     logger.debug(f"query_params:\n{pformat(query_params)}")
+
+     if client_id is None:
+         logger.info("No clientID provided. Disconnecting...")
+         await sio.disconnect(sid)
+         return
+
+     # On reconnect we need to rejoin rooms and reset session data
+     if member_directory.get(client_id):
+         room = member_directory[client_id].get("room")
+         room_id = room.room_id
+         # Note: We could also get this from room.members[client_id]
+         member = member_directory[client_id].get("member_object")
+
+         member.connection_status = "connected"
+         member.session_id = sid
+
+         logger.info(
+             f"[event: connect] {member} reconnected. Attempting to re-add them to socketio rooms and reset session data."
+         )
+
+         if room is None or member is None:
+             logger.error(
+                 f"[event: connect] {client_id} is reconnecting, but room or member is None. This should not happen."
+             )
+             await sio.disconnect(sid)
+             return
+
+         sio.enter_room(sid, room_id)
+         sio.enter_room(sid, ALL_ROOM_ID)
+
+         if client_id in room.listeners:
+             sio.enter_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         if client_id in room.speakers:
+             sio.enter_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+
+         # Save the room_id to the socketio client session
+         await set_session_data(
+             sid,
+             client_id=client_id,
+             room_id=room.room_id,
+             room_object=room,
+             member_object=member,
+         )
+         await emit_room_state_update(room)
+     else:
+         # Save the client id to the socketio client session
+         await set_session_data(
+             sid, client_id=client_id, room_id=None, room_object=None, member_object=None
+         )
+
+     await sio.emit("server_id", server_id, to=sid)
+     await emit_server_state_update()
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def disconnect(sid):
+     global server_lock
+     session_data = await get_session_data(sid)
+     # logger.info("session_data", session_data)
+
+     client_id = None
+     member = None
+     room = None
+
+     if session_data:
+         client_id = session_data.get("client_id")
+         member = session_data.get("member_object")
+         room = session_data.get("room_object")
+
+     logger.info(
+         f"[event: disconnect][{room or 'NOT_IN_ROOM'}] member: {member or 'NO_MEMBER_OBJECT'} disconnected"
+     )
+
+     # Release the lock if this is the client that holds the current server lock
+     if server_lock and server_lock.get("client_id") == client_id:
+         server_lock = None
+
+     if member:
+         member.connection_status = "disconnected"
+
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+             member.requested_output_type = None
+
+         if room:
+             logger.info(
+                 f"[event: disconnect] {member} disconnected from room {room.room_id}"
+             )
+             await emit_room_state_update(room)
+         else:
+             logger.info(
+                 f"[event: disconnect] {member} disconnected, but no room object present. This should not happen."
+             )
+     else:
+         logger.info(
+             f"[event: disconnect] client_id {client_id or 'NO_CLIENT_ID'} with sid {sid} in rooms {str(sio.rooms(sid))} disconnected"
+         )
+
+     await emit_server_state_update()
+
+
+ @sio.on("*")
+ async def catch_all(event, sid, data):
+     logger.info(f"[unhandled event: {event}] sid={sid} data={data}")
+
+
+ ###############################################
+ # Socket.io Streaming Event handlers
+ ###############################################
+
+
+ @sio.on("join_room")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def join_room(sid, client_id, room_id_from_client, config_dict):
+     global server_lock
+
+     args = {
+         "sid": sid,
+         "client_id": client_id,
+         "room_id": room_id_from_client,
+         "config_dict": config_dict,
+     }
+     logger.info(f"[event: join_room] {args}")
+     session_data = await get_session_data(sid)
+     logger.info(f"session_data: {session_data}")
+
+     room_id = room_id_from_client
+     if room_id is None:
+         room_id = get_random_unused_room_id()
+         logger.info(
+             f"No room_id provided. Generating a random, unused room_id: {room_id}"
+         )
+
+     # Create the room if it doesn't already exist
+     if room_id not in rooms:
+         rooms[room_id] = Room(room_id)
+
+     room = rooms[room_id]
+
+     member = None
+
+     name = "[NO_NAME]"
+
+     # If the client is reconnecting use their existing member object. Otherwise create a new one.
+     if client_id in room.members:
+         member = room.members[client_id]
+         logger.info(f"{member} is rejoining room {room_id}.")
+     else:
+         member_number = len(room.members) + 1
+         name = f"Member {member_number}"
+         member = Member(
+             client_id=client_id,
+             session_id=sid,
+             name=name,
+         )
+         logger.info(f"Created a new Member object: {member}")
+         logger.info(f"Adding {member} to room {room_id}")
+         room.members[client_id] = member
+
+     # Also add them to the member directory
+     member_directory[client_id] = {"room": room, "member_object": member}
+
+     # Join the socketio room, which enables broadcasting to all members of the room
+     sio.enter_room(sid, room_id)
+     # Join the room for all clients
+     sio.enter_room(sid, ALL_ROOM_ID)
+
+     if "listener" in config_dict["roles"]:
+         sio.enter_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         if client_id not in room.listeners:
+             room.listeners.append(client_id)
+     else:
+         sio.leave_room(sid, f"{room_id}{ROOM_LISTENERS_SUFFIX}")
+         room.listeners = [
+             listener_id for listener_id in room.listeners if listener_id != client_id
+         ]
+
+     if "speaker" in config_dict["roles"]:
+         sio.enter_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+         if client_id not in room.speakers:
+             room.speakers.append(client_id)
+     else:
+         sio.leave_room(sid, f"{room_id}{ROOM_SPEAKERS_SUFFIX}")
+         # If the person is no longer a speaker they should no longer be able to lock the server
+         if server_lock and server_lock.get("client_id") == client_id:
+             logger.info(
+                 f"🔓 Server is now unlocked from client {server_lock.get('client_id')} with name/info: {server_lock.get('name')}"
+             )
+             server_lock = None
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+         room.speakers = [
+             speaker_id for speaker_id in room.speakers if speaker_id != client_id
+         ]
+
+     # If we currently own the server lock and are updating roles and we no longer have server lock specified, release it
+     if (
+         server_lock is not None
+         and server_lock["client_id"] == client_id
+         and config_dict.get("lockServerName") is None
+     ):
+         logger.info(f"[join_room] Releasing server lock: {pformat(server_lock)}")
+         server_lock = None
+
+     # Only speakers should be able to lock the server
+     if config_dict.get("lockServerName") is not None and "speaker" in config_dict.get(
+         "roles", {}
+     ):
+         # If something goes wrong and the server gets stuck in a locked state the client can
+         # force the server to remove the lock by passing the special name ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
+         if (
+             server_lock is not None
+             and config_dict.get("lockServerName")
+             == ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME
+         ):
+             server_lock = None
+             logger.info(
+                 f"🔓 Server lock has been reset by {client_id} using the escape hatch name {ESCAPE_HATCH_SERVER_LOCK_RELEASE_NAME}"
+             )
+
+         # If the server is not locked, set a lock. If it's already locked to this client, update the lock object
+         elif server_lock is None or server_lock.get("client_id") == client_id:
+             # TODO: Add some sort of timeout as a backstop in case someone leaves the browser tab open after locking the server
+             server_lock = {
+                 "name": config_dict.get("lockServerName"),
+                 "client_id": client_id,
+                 "member_object": member,
+             }
+             logger.info(
+                 f"🔒 Server is now locked to client {server_lock.get('client_id')} with name/info: {server_lock.get('name')}\nThis client will have priority over all others until they disconnect."
+             )
+         # If the server is already locked to someone else, don't allow this client to lock it
+         elif server_lock is not None and server_lock.get("client_id") != client_id:
+             logger.warn(
+                 f"⚠️ Server is already locked to client {server_lock.get('client_id')}. Ignoring request to lock to client {client_id}."
+             )
+             # TODO: Maybe throw an error here?
+
+     # Save the room_id to the socketio client session
+     await set_session_data(
+         sid,
+         client_id=client_id,
+         room_id=room_id,
+         room_object=room,
+         member_object=member,
+     )
+
+     await emit_room_state_update(room)
+     await emit_server_state_update()
+
+     return {"roomsJoined": sio.rooms(sid), "roomID": room_id}
+
+
+ # TODO: Add code to prevent more than one speaker from connecting/streaming at a time
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def configure_stream(sid, config):
+     session_data = await get_session_data(sid)
+     client_id, member, room = itemgetter("client_id", "member_object", "room_object")(
+         session_data
+     )
+
+     logger.debug(
+         f"[event: configure_stream][{room}] Received stream config from {member}\n{pformat(config)}"
+     )
+
+     if member is None or room is None:
+         logger.error(
+             f"Received stream config from {member}, but member or room is None. This should not happen."
+         )
+         return {"status": "error", "message": "member_or_room_is_none"}
+
+     # If there is a server lock WITH an active transcoder session, prevent other users from configuring and starting a stream
+     # If the server lock client does NOT have an active transcoder session allow this to proceed, knowing that
+     # this stream will be interrupted if the server lock client starts streaming
+     if (
+         server_lock is not None
+         and server_lock.get("client_id") != client_id
+         and server_lock.get("member_object")
+         and server_lock.get("member_object").transcoder is not None
+     ):
+         logger.warn(
+             f"Server is locked to client {server_lock.get('client_id')}. Ignoring request to configure stream from client {client_id}."
+         )
+         return {"status": "error", "message": "server_locked"}
+
+     debug = config.get("debug")
+     async_processing = config.get("async_processing")
+
+     # Currently s2s, s2t or s2s&t
+     model_type = config.get("model_type")
+     member.requested_output_type = model_type
+
+     model_name = config.get("model_name")
+
+     try:
+         agent = available_agents.get_agent_or_throw(model_name)
+     except NoAvailableAgentException as e:
+         logger.warn(f"Error while getting agent: {e}")
+         # await sio.emit("error", str(e), to=sid)
+         await sio.disconnect(sid)
+         return {"status": "error", "message": str(e)}
+
+     if member.transcoder:
+         logger.warn(
+             "Member already has a transcoder configured. Closing it, and overwriting with a new transcoder..."
+         )
+         member.transcoder.close = True
+
+     t0 = time.time()
+     try:
+         member.transcoder = SimulevalTranscoder(
+             agent,
+             config["rate"],
+             debug=debug,
+             buffer_limit=int(config["buffer_limit"]),
+         )
+     except Exception as e:
+         logger.warn(f"Got exception while initializing agents: {e}")
+         # await sio.emit("error", str(e), to=sid)
+         await sio.disconnect(sid)
+         return {"status": "error", "message": str(e)}
+
+     t1 = time.time()
+     logger.debug(f"Booting up VAD and transcoder took {t1-t0} sec")
+
+     # TODO: if async_processing is false, then we need to run transcoder.process_pipeline_once() whenever we receive audio, or at some other sensible interval
+     if async_processing:
+         member.transcoder.start()
+
+     # We need to emit a room state update here since room state now includes # of active transcoders
+     await emit_room_state_update(room)
+     await emit_server_state_update()
+
+     return {"status": "ok", "message": "server_ready"}
+
+
+ # The config here is a partial config, meaning it may not contain all the config values -- only the ones the user
+ # wants to change
+ @sio.on("set_dynamic_config")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def set_dynamic_config(
+     sid,
+     # partial_config's type is defined in StreamingTypes.ts
+     partial_config,
+ ):
+     session_data = await get_session_data(sid)
+
+     member = None
+
+     if session_data:
+         member = session_data.get("member_object")
+
+     if member:
+         new_dynamic_config = {
+             **(member.transcoder_dynamic_config or {}),
+             **partial_config,
+         }
+         logger.info(
+             f"[set_dynamic_config] Setting new dynamic config:\n\n{pformat(new_dynamic_config)}\n"
+         )
+         member.transcoder_dynamic_config = new_dynamic_config
+
+     return {"status": "ok", "message": "dynamic_config_set"}
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def incoming_audio(sid, blob):
+     session_data = await get_session_data(sid)
+
+     client_id = None
+     member = None
+     room = None
+
+     if session_data:
+         client_id = session_data.get("client_id")
+         member = session_data.get("member_object")
+         room = session_data.get("room_object")
+
+     logger.debug(f"[event: incoming_audio] from member {member}")
+
+     # If the server is locked by someone else, kill our transcoder and ignore incoming audio
+     # If the server lock client does NOT have an active transcoder session allow this incoming audio pipeline to proceed,
+     # knowing that this stream will be interrupted if the server lock client starts streaming
+     if (
+         server_lock is not None
+         and server_lock.get("client_id") != client_id
+         and server_lock.get("member_object")
+         and server_lock.get("member_object").transcoder is not None
+     ):
+         # TODO: Send an event to the client to let them know their streaming session has been killed
+         if member.transcoder:
+             member.transcoder.close = True
+             member.transcoder = None
+             # Update both room state and server state given that the number of active transcoders has changed
+             if room:
+                 await emit_room_state_update(room)
+             await emit_server_state_update()
+         logger.warn(
+             f"[incoming_audio] Server is locked to client {server_lock.get('client_id')}. Ignoring incoming audio from client {client_id}."
+         )
+         return
+
+     if member is None or room is None:
+         logger.error(
+             f"[incoming_audio] Received incoming_audio from {member}, but member or room is None. This should not happen."
+         )
+         return
+
+     # NOTE: bytes and bytearray are very similar, but bytes is immutable, and is what is returned by socketio
+     if not isinstance(blob, bytes):
+         logger.error(
+             f"[incoming_audio] Received audio from {member}, but it was not of type `bytes`. type(blob) = {type(blob)}"
+         )
+         return
+
+     if member.transcoder is None:
+         logger.error(
+             f"[incoming_audio] Received audio from {member}, but no transcoder configured to process it (member.transcoder is None). This should not happen."
+         )
+         return
+
+     member.transcoder.process_incoming_bytes(
+         blob, dynamic_config=member.transcoder_dynamic_config
+     )
+
+     # Send back any available model output
+     # NOTE: In theory it would make sense to remove this from the incoming_audio handler and
+     # handle this in a dedicated thread that checks for output and sends it right away,
+     # but in practice for our limited demo use cases this approach didn't add noticeable
+     # latency, so we're keeping it simple for now.
+     events = get_transcoder_output_events(member.transcoder)
+     logger.debug(f"[incoming_audio] transcoder output events: {len(events)}")
+
+     if len(events) == 0:
+         logger.debug("[incoming_audio] No transcoder output to send")
+     else:
+         for e in events:
+             if e["event"] == "translation_speech" and member.requested_output_type in [
+                 "s2s",
+                 "s2s&t",
+             ]:
+                 logger.debug("[incoming_audio] Sending translation_speech event")
+                 await sio.emit(
+                     "translation_speech", e, room=f"{room.room_id}_listeners"
+                 )
+             elif e["event"] == "translation_text" and member.requested_output_type in [
+                 "s2t",
+                 "s2s&t",
+             ]:
+                 logger.debug("[incoming_audio] Sending translation_text event")
+                 await sio.emit("translation_text", e, room=f"{room.room_id}_listeners")
+             else:
+                 logger.error(f"[incoming_audio] Unexpected event type: {e['event']}")
+
+     return
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def stop_stream(sid):
+     session_data = await get_session_data(sid)
+     client_id, member, room = itemgetter("client_id", "member_object", "room_object")(
+         session_data
+     )
+
+     logger.debug(f"[event: stop_stream][{room}] Attempting to stop stream for {member}")
+
+     if member is None or room is None:
+         message = f"Received stop_stream from {member}, but member or room is None. This should not happen."
+         logger.error(message)
+         return {"status": "error", "message": message}
+
+     # In order to stop the stream and end the transcoder thread, set close to True and unset it for the member
+     if member.transcoder:
+         member.transcoder.close = True
+         member.transcoder = None
+     else:
+         message = f"Received stop_stream from {member}, but member.transcoder is None. This should not happen."
+         logger.warn(message)
+
+     # We need to emit a room state update here since room state now includes # of active transcoders
+     await emit_room_state_update(room)
+     # Emit a server state update now that we've changed the number of active transcoders
+     await emit_server_state_update()
+
+     return {"status": "ok", "message": "Stream stopped"}
+
+
+ @sio.on("clear_transcript_for_all")
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def clear_transcript_for_all(sid):
+     session_data = await get_session_data(sid)
+
+     room = session_data.get("room_object")
+
+     if room:
+         await sio.emit("clear_transcript", room=f"{room.room_id}")
+     else:
+         logger.error("[clear_transcript] room is None. This should not happen.")
+
+
+ @sio.event
+ @catch_and_log_exceptions_for_sio_event_handlers
+ async def set_name(sid, name):
+     logger.info(f"[Event: set_name] name={name}")
+     await sio.save_session(sid, {"name": name})
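
The event flow above is easiest to see from the client side. Below is a minimal, hypothetical Python client sketch (not part of this commit) using the `python-socketio` client. It assumes the dev server from the README running on uvicorn's default port 8000, that the `/ws` mount corresponds to the engine.io path `ws/socket.io`, and it reuses event and field names from the handlers above (`clientID`, `roles`, `model_name`, `rate`, `buffer_limit`, `targetLanguage`); the audio payload itself is left as a placeholder.

```
# Hypothetical client sketch -- not part of this commit.
import uuid
import socketio

sio = socketio.Client()
client_id = str(uuid.uuid4())

# The connect handler disconnects any client that omits the clientID query param.
sio.connect(
    f"http://localhost:8000?clientID={client_id}",
    socketio_path="ws/socket.io",  # assumed mapping for the /ws mount
)

# Join (or create) a room as a speaker; None asks the server for a random room ID.
result = sio.call("join_room", (client_id, None, {"roles": ["speaker"]}))
print("Joined room:", result["roomID"])

# Set the dynamic config first so incoming_audio has a target language to use.
sio.call("set_dynamic_config", {"targetLanguage": "spa"})

# Configure the stream; keys follow the configure_stream handler above.
sio.call(
    "configure_stream",
    {
        "model_name": "SeamlessStreaming",
        "model_type": "s2s&t",
        "rate": 48000,          # sample rate of the PCM audio we will send
        "buffer_limit": 1,
        "debug": False,
        "async_processing": True,
    },
)

# Then stream 16-bit PCM chunks as raw bytes:
# sio.emit("incoming_audio", pcm_chunk_bytes)
```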
seamless_server/debug/.gitignore ADDED
@@ -0,0 +1,4 @@
+ # Ignore everything in this directory
+ *
+ # Except this file
+ !.gitignore
seamless_server/models/SeamlessStreaming/vad_s2st_sc_24khz_main.yaml ADDED
@@ -0,0 +1,21 @@
+ agent_class: seamless_communication.streaming.agents.seamless_streaming_s2st.SeamlessStreamingS2STJointVADAgent
+ monotonic_decoder_model_name: seamless_streaming_monotonic_decoder
+ unity_model_name: seamless_streaming_unity
+ sentencepiece_model: spm_256k_nllb100.model
+
+ task: s2st
+ tgt_lang: "eng"
+ min_unit_chunk_size: 50
+ decision_threshold: 0.7
+ no_early_stop: True
+ block_ngrams: True
+ vocoder_name: vocoder_v2
+ wav2vec_yaml: wav2vec.yaml
+ min_starting_wait_w2vbert: 192
+
+ config_yaml: cfg_fbank_u2t.yaml
+ upstream_idx: 1
+ detokenize_only: True
+ device: cuda:0
+ max_len_a: 0
+ max_len_b: 1000
seamless_server/requirements.txt ADDED
@@ -0,0 +1,30 @@
+ --pre --extra-index-url https://fair.pkg.atmeta.com/fairseq2/pt2.1.1/cu118
+ simuleval==1.1.3
+ # seamless_communication
+ ./whl/seamless_communication-1.0.0-py3-none-any.whl
+ Flask==2.1.3
+ Flask_Sockets==0.2.1
+ g2p_en==2.1.0
+ gevent==22.10.2
+ gevent_websocket==0.10.1
+ librosa==0.9.2
+ numpy==1.24.4
+ openai_whisper==20230124
+ protobuf==4.24.2
+ psola==0.0.1
+ pydub==0.25.1
+ silero==0.4.1
+ # simuleval==1.1.1
+ soundfile==0.11.0
+ stable_ts==1.4.0
+ # torch # to be installed by user for desired PyTorch version
+ Werkzeug==2.0.3
+ whisper==1.1.10
+ colorlog==6.7.0
+ python-socketio==5.9.0
+ uvicorn[standard]==0.23.2
+ parallel-wavegan==0.5.5
+ python-jose[cryptography]==3.3.0
+ starlette==0.32.0.post1
+ hf_transfer==0.1.4
+ huggingface_hub==0.19.4
seamless_server/src/room.py ADDED
@@ -0,0 +1,64 @@
+ # import json
+ import uuid
+
+
+ class Room:
+     def __init__(self, room_id) -> None:
+         self.room_id = room_id
+         # members is a dict from client_id to Member
+         self.members = {}
+
+         # listeners and speakers are lists of client_id's
+         self.listeners = []
+         self.speakers = []
+
+     def __str__(self) -> str:
+         return f"Room {self.room_id} ({len(self.members)} member{'s' if len(self.members) != 1 else ''})"
+
+     def to_json(self):
+         varsResult = vars(self)
+         # Remember: result is just a shallow copy, so result.members === self.members
+         # Because of that, we need to jsonify self.members without writing over result.members,
+         # which we do here via dictionary unpacking (the ** operator)
+         result = {
+             **varsResult,
+             "members": {key: value.to_json() for (key, value) in self.members.items()},
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+         return result
+
+     def get_active_connections(self):
+         return len(
+             [m for m in self.members.values() if m.connection_status == "connected"]
+         )
+
+     def get_active_transcoders(self):
+         return len([m for m in self.members.values() if m.transcoder is not None])
+
+     def get_room_status_dict(self):
+         return {
+             "activeConnections": self.get_active_connections(),
+             "activeTranscoders": self.get_active_transcoders(),
+         }
+
+
+ class Member:
+     def __init__(self, client_id, session_id, name) -> None:
+         self.client_id = client_id
+         self.session_id = session_id
+         self.name = name
+         self.connection_status = "connected"
+         self.transcoder = None
+         self.requested_output_type = None
+         self.transcoder_dynamic_config = None
+
+     def __str__(self) -> str:
+         return f"{self.name} (id: {self.client_id[:4]}...) ({self.connection_status})"
+
+     def to_json(self):
+         self_vars = vars(self)
+         return {
+             **self_vars,
+             "transcoder": self.transcoder is not None,
+         }
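
For reference, a small sketch (not part of this commit) of the payload shape that `emit_room_state_update` broadcasts, derived directly from `Room.to_json()` and `Member.to_json()` above:

```
# Illustrative only: the "room_state_update" payload shape.
from src.room import Room, Member

room = Room("ABCD")
room.members["client-1234"] = Member(
    client_id="client-1234", session_id="sid-1", name="Member 1"
)
room.speakers.append("client-1234")

print(room.to_json())
# {
#     "room_id": "ABCD",
#     "members": {
#         "client-1234": {
#             "client_id": "client-1234",
#             "session_id": "sid-1",
#             "name": "Member 1",
#             "connection_status": "connected",
#             "requested_output_type": None,
#             "transcoder_dynamic_config": None,
#             "transcoder": False,  # bool, not the transcoder object
#         }
#     },
#     "listeners": [],
#     "speakers": ["client-1234"],
#     "activeTranscoders": 0,
# }
```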
seamless_server/src/simuleval_agent_directory.py ADDED
@@ -0,0 +1,150 @@
+ # Creates a directory in which to look up available agents
+
+ from typing import List
+ from src.simuleval_transcoder import SimulevalTranscoder
+ import json
+ import logging
+
+ logger = logging.getLogger("socketio_server_pubsub")
+
+ # fmt: off
+ M4T_P0_LANGS = [
+     "eng",
+     "arb", "ben", "cat", "ces", "cmn", "cym", "dan",
+     "deu", "est", "fin", "fra", "hin", "ind", "ita",
+     "jpn", "kor", "mlt", "nld", "pes", "pol", "por",
+     "ron", "rus", "slk", "spa", "swe", "swh", "tel",
+     "tgl", "tha", "tur", "ukr", "urd", "uzn", "vie",
+ ]
+ # fmt: on
+
+
+ class NoAvailableAgentException(Exception):
+     pass
+
+
+ class AgentWithInfo:
+     def __init__(
+         self,
+         agent,
+         name: str,
+         modalities: List[str],
+         target_langs: List[str],
+         # Supported dynamic params are defined in StreamingTypes.ts
+         dynamic_params: List[str] = [],
+         description="",
+     ):
+         self.agent = agent
+         self.name = name
+         self.description = description
+         self.modalities = modalities
+         self.target_langs = target_langs
+         self.dynamic_params = dynamic_params
+
+     def get_capabilities_for_json(self):
+         return {
+             "name": self.name,
+             "description": self.description,
+             "modalities": self.modalities,
+             "targetLangs": self.target_langs,
+             "dynamicParams": self.dynamic_params,
+         }
+
+     @classmethod
+     def load_from_json(cls, config: str):
+         """
+         Takes in a JSON array of models to load in, e.g.
+         [{"name": "s2s_m4t_emma-unity2_multidomain_v0.1", "description": "M4T model that supports simultaneous S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]},
+          {"name": "s2s_m4t_expr-emma_v0.1", "description": "ES-EN expressive model that supports S2S and S2T", "modalities": ["s2t", "s2s"], "targetLangs": ["en"]}]
+         """
+         configs = json.loads(config)
+         agents = []
+         for config in configs:
+             agent = SimulevalTranscoder.build_agent(config["name"])
+             agents.append(
+                 AgentWithInfo(
+                     agent=agent,
+                     name=config["name"],
+                     modalities=config["modalities"],
+                     target_langs=config["targetLangs"],
+                 )
+             )
+         return agents
+
+
+ class SimulevalAgentDirectory:
+     # Available models. These are the directories where the models can be found, and also serve as an ID for the model.
+     seamless_streaming_agent = "SeamlessStreaming"
+
+     def __init__(self):
+         self.agents = []
+         self.did_build_and_add_agents = False
+
+     def add_agent(self, agent: AgentWithInfo):
+         self.agents.append(agent)
+
+     def build_agent_if_available(self, model_id, config_name=None):
+         agent = None
+         try:
+             if config_name is not None:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                     config_name=config_name,
+                 )
+             else:
+                 agent = SimulevalTranscoder.build_agent(
+                     model_id,
+                 )
+         except Exception as e:
+             logger.warning("Failed to build agent %s: %s" % (model_id, e))
+             raise e
+
+         return agent
+
+     def build_and_add_agents(self, models_override=None):
+         if self.did_build_and_add_agents:
+             return
+
+         if models_override is not None:
+             agent_infos = AgentWithInfo.load_from_json(models_override)
+             for agent_info in agent_infos:
+                 self.add_agent(agent_info)
+         else:
+             s2s_m4t_expr_agent = self.build_agent_if_available(
+                 SimulevalAgentDirectory.seamless_streaming_agent,
+                 config_name="vad_s2st_sc_24khz_main.yaml",
+             )
+
+             if s2s_m4t_expr_agent:
+                 self.add_agent(
+                     AgentWithInfo(
+                         agent=s2s_m4t_expr_agent,
+                         name=SimulevalAgentDirectory.seamless_streaming_agent,
+                         modalities=["s2t", "s2s"],
+                         target_langs=M4T_P0_LANGS,
+                         dynamic_params=["expressive"],
+                         description="multilingual expressive model that supports S2S and S2T",
+                     )
+                 )
+
+         if len(self.agents) == 0:
+             logger.error(
+                 "No agents were loaded. This likely means you are missing the actual model files specified in simuleval_agent_directory."
+             )
+
+         self.did_build_and_add_agents = True
+
+     def get_agent(self, name):
+         for agent in self.agents:
+             if agent.name == name:
+                 return agent.agent
+         return None
+
+     def get_agent_or_throw(self, name):
+         agent = self.get_agent(name)
+         if agent is None:
+             raise NoAvailableAgentException("No agent found with name=%s" % (name))
+         return agent
+
+     def get_agents_capabilities_list_for_json(self):
+         return [agent.get_capabilities_for_json() for agent in self.agents]
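
A minimal sketch (not part of this commit) of overriding the loaded models via the `MODELS_OVERRIDE` environment variable, which `app_pubsub.py` reads at import time. The JSON shape follows the `AgentWithInfo.load_from_json` docstring above; the model name must match a directory under `seamless_server/models/`, and the language list here is illustrative.

```
# Hypothetical usage sketch: set MODELS_OVERRIDE before the server process starts.
import json
import os

os.environ["MODELS_OVERRIDE"] = json.dumps(
    [
        {
            "name": "SeamlessStreaming",  # directory under seamless_server/models/
            "description": "SeamlessStreaming model supporting S2S and S2T",
            "modalities": ["s2t", "s2s"],
            "targetLangs": ["eng", "fra", "spa"],  # illustrative subset
        }
    ]
)
# app_pubsub.py then calls:
#   available_agents.build_and_add_agents(os.environ.get("MODELS_OVERRIDE"))
```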
seamless_server/src/simuleval_transcoder.py ADDED
@@ -0,0 +1,423 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from simuleval.utils.agent import build_system_from_dir
2
+ from typing import Any, List, Optional, Tuple, Union
3
+ import numpy as np
4
+ import soundfile
5
+ import io
6
+ import asyncio
7
+ from simuleval.agents.pipeline import TreeAgentPipeline
8
+ from simuleval.agents.states import AgentStates
9
+ from simuleval.data.segments import Segment, EmptySegment, SpeechSegment
10
+ import threading
11
+ import math
12
+ import logging
13
+ import sys
14
+ from pathlib import Path
15
+ import time
16
+ from g2p_en import G2p
17
+ import torch
18
+ import traceback
19
+ import time
20
+ import random
21
+ import colorlog
22
+
23
+ from .speech_and_text_output import SpeechAndTextOutput
24
+
25
+ MODEL_SAMPLE_RATE = 16_000
26
+
27
+ logger = logging.getLogger(__name__)
28
+ # logger.propagate = False
29
+ handler = colorlog.StreamHandler(stream=sys.stdout)
30
+ formatter = colorlog.ColoredFormatter(
31
+ "%(log_color)s[%(asctime)s][%(levelname)s][%(module)s]:%(reset)s %(message)s",
32
+ reset=True,
33
+ log_colors={
34
+ "DEBUG": "cyan",
35
+ "INFO": "green",
36
+ "WARNING": "yellow",
37
+ "ERROR": "red",
38
+ "CRITICAL": "red,bg_white",
39
+ },
40
+ )
41
+ handler.setFormatter(formatter)
42
+ logger.addHandler(handler)
43
+ logger.setLevel(logging.WARNING)
44
+
45
+
46
+ class OutputSegments:
47
+ def __init__(self, segments: Union[List[Segment], Segment]):
48
+ if isinstance(segments, Segment):
49
+ segments = [segments]
50
+ self.segments: List[Segment] = [s for s in segments]
51
+
52
+ @property
53
+ def is_empty(self):
54
+ return all(segment.is_empty for segment in self.segments)
55
+
56
+ @property
57
+ def finished(self):
58
+ return all(segment.finished for segment in self.segments)
59
+
60
+ def compute_length(self, g2p):
61
+ lengths = []
62
+ for segment in self.segments:
63
+ if segment.data_type == "text":
64
+ lengths.append(len([x for x in g2p(segment.content) if x != " "]))
65
+ elif segment.data_type == "speech":
66
+ lengths.append(len(segment.content) / MODEL_SAMPLE_RATE)
67
+ elif isinstance(segment, EmptySegment):
68
+ continue
69
+ else:
70
+ logger.warning(
71
+ f"Unexpected data_type: {segment.data_type} not in 'speech', 'text'"
72
+ )
73
+ return max(lengths)
74
+
75
+ @classmethod
76
+ def join_output_buffer(
77
+ cls, buffer: List[List[Segment]], output: SpeechAndTextOutput
78
+ ):
79
+ num_segments = len(buffer[0])
80
+ for i in range(num_segments):
81
+ segment_list = [
82
+ buffer[j][i]
83
+ for j in range(len(buffer))
84
+ if buffer[j][i].data_type is not None
85
+ ]
86
+ if len(segment_list) == 0:
87
+ continue
88
+ if len(set(segment.data_type for segment in segment_list)) != 1:
89
+ logger.warning(
90
+ f"Data type mismatch at {i}: {set(segment.data_type for segment in segment_list)}"
91
+ )
92
+ continue
93
+ data_type = segment_list[0].data_type
94
+ if data_type == "text":
95
+ if output.text is not None:
96
+ logger.warning("Multiple text outputs, overwriting!")
97
+ output.text = " ".join([segment.content for segment in segment_list])
98
+ elif data_type == "speech":
99
+ if output.speech_samples is not None:
100
+ logger.warning("Multiple speech outputs, overwriting!")
101
+ speech_out = []
102
+ for segment in segment_list:
103
+ speech_out += segment.content
104
+ output.speech_samples = speech_out
105
+ output.speech_sample_rate = segment.sample_rate
106
+ elif isinstance(segment_list[0], EmptySegment):
107
+ continue
108
+ else:
109
+ logger.warning(
110
+ f"Invalid output buffer data type: {data_type}, expected 'speech' or 'text"
111
+ )
112
+
113
+ return output
114
+
115
+ def __repr__(self) -> str:
116
+ repr_str = str(self.segments)
117
+ return f"{self.__class__.__name__}(\n\t{repr_str}\n)"
118
+
119
+
120
+ class SimulevalTranscoder:
121
+ def __init__(self, agent, sample_rate, debug, buffer_limit):
122
+ self.agent = agent
123
+ self.input_queue = asyncio.Queue()
124
+ self.output_queue = asyncio.Queue()
125
+ self.states = self.agent.build_states()
126
+ if debug:
127
+ self.get_states_root().debug = True
128
+ self.incoming_sample_rate = sample_rate
129
+ self.close = False
130
+ self.g2p = G2p()
131
+
132
+ # buffer all outgoing translations within this amount of time
133
+ self.output_buffer_idle_ms = 5000
134
+ self.output_buffer_size_limit = (
135
+ buffer_limit # phonemes for text, seconds for speech
136
+ )
137
+ self.output_buffer_cur_size = 0
138
+ self.output_buffer: List[List[Segment]] = []
139
+ self.speech_output_sample_rate = None
140
+
141
+ self.last_output_ts = time.time() * 1000
142
+ self.timeout_ms = (
143
+ 30000 # close the transcoder thread after this amount of silence
144
+ )
145
+ self.first_input_ts = None
146
+ self.first_output_ts = None
147
+ self.debug = debug
148
+ self.debug_ts = f"{time.time()}_{random.randint(1000, 9999)}"
149
+ if self.debug:
150
+ debug_folder = Path(__file__).resolve().parent.parent / "debug"
151
+ self.test_incoming_wav = soundfile.SoundFile(
152
+ debug_folder / f"{self.debug_ts}_test_incoming.wav",
153
+ mode="w+",
154
+ format="WAV",
155
+ subtype="PCM_16",
156
+ samplerate=self.incoming_sample_rate,
157
+ channels=1,
158
+ )
159
+ self.get_states_root().test_input_segments_wav = soundfile.SoundFile(
160
+ debug_folder / f"{self.debug_ts}_test_input_segments.wav",
161
+ mode="w+",
162
+ format="WAV",
163
+ samplerate=MODEL_SAMPLE_RATE,
164
+ channels=1,
165
+ )
166
+
167
+ def get_states_root(self) -> AgentStates:
168
+ if isinstance(self.agent, TreeAgentPipeline):
169
+ # self.states is a dict
170
+ return self.states[self.agent.source_module]
171
+ else:
172
+ # self.states is a list
173
+ return self.states[0]
174
+
175
+ def reset_states(self):
176
+ if isinstance(self.agent, TreeAgentPipeline):
177
+ states_iter = self.states.values()
178
+ else:
179
+ states_iter = self.states
180
+ for state in states_iter:
181
+ state.reset()
182
+
183
+ def debug_log(self, *args):
184
+ if self.debug:
185
+ logger.info(*args)
186
+
187
+ @classmethod
188
+ def build_agent(cls, model_path, config_name="vad_s2st_main.yaml"):
189
+ logger.info(f"Building simuleval agent: {model_path}, {config_name}")
190
+ agent = build_system_from_dir(
191
+ Path(__file__).resolve().parent.parent / f"models/{model_path}",
192
+ config_name=config_name,
193
+ )
194
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
195
+ agent.to(device, fp16=True)
196
+ logger.info(
197
+ f"Successfully built simuleval agent {model_path} on device {device}"
198
+ )
199
+
200
+ return agent
201
+
202
+ def process_incoming_bytes(self, incoming_bytes, dynamic_config):
203
+ # TODO: We probably want to do some validation on dynamic_config to ensure it has what we needs
204
+ segment, sr = self._preprocess_wav(incoming_bytes)
205
+ segment = SpeechSegment(
206
+ content=segment,
207
+ sample_rate=sr,
208
+ tgt_lang=dynamic_config.get("targetLanguage"),
209
+ config=dynamic_config,
210
+ )
211
+ # # segment is array([0, 0, 0, ..., 0, 0, 0], dtype=int16)
212
+ self.input_queue.put_nowait(segment)
213
+
214
+ def get_input_segment(self):
215
+ if self.input_queue.empty():
216
+ return None
217
+ chunk = self.input_queue.get_nowait()
218
+ self.input_queue.task_done()
219
+ return chunk
220
+
221
+ def convert_waveform(
222
+ self,
223
+ waveform: Union[np.ndarray, torch.Tensor],
224
+ sample_rate: int,
225
+ normalize_volume: bool = False,
226
+ to_mono: bool = False,
227
+ to_sample_rate: Optional[int] = None,
228
+ ) -> Tuple[Union[np.ndarray, torch.Tensor], int]:
229
+ """convert a waveform:
230
+ - to a target sample rate
231
+ - from multi-channel to mono channel
232
+ - volume normalization
233
+
234
+ Args:
235
+ waveform (numpy.ndarray or torch.Tensor): 2D original waveform
236
+ (channels x length)
237
+ sample_rate (int): original sample rate
238
+ normalize_volume (bool): perform volume normalization
239
+ to_mono (bool): convert to mono channel if having multiple channels
240
+ to_sample_rate (Optional[int]): target sample rate
241
+ Returns:
242
+ waveform (numpy.ndarray): converted 2D waveform (channels x length)
243
+ sample_rate (int): target sample rate
244
+ """
245
+ try:
246
+ import torchaudio.sox_effects as ta_sox
247
+ except ImportError:
248
+ raise ImportError("Please install torchaudio: pip install torchaudio")
249
+
250
+ effects = []
251
+ if normalize_volume:
252
+ effects.append(["gain", "-n"])
253
+ if to_sample_rate is not None and to_sample_rate != sample_rate:
254
+ effects.append(["rate", f"{to_sample_rate}"])
255
+ if to_mono and waveform.shape[0] > 1:
256
+ effects.append(["channels", "1"])
257
+ if len(effects) > 0:
258
+ is_np_input = isinstance(waveform, np.ndarray)
259
+ _waveform = torch.from_numpy(waveform) if is_np_input else waveform
260
+ converted, converted_sample_rate = ta_sox.apply_effects_tensor(
261
+ _waveform, sample_rate, effects
262
+ )
263
+ if is_np_input:
264
+ converted = converted.numpy()
265
+ return converted, converted_sample_rate
266
+ return waveform, sample_rate
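+
+ # e.g. to downmix and resample a (channels x length) buffer for the model
+ # (the same call _preprocess_wav makes below; 44100 is illustrative):
+ #     mono, sr = self.convert_waveform(wav, 44100, to_mono=True,
+ #                                      to_sample_rate=MODEL_SAMPLE_RATE)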
267
+
268
+ def _preprocess_wav(self, data: Any) -> Tuple[np.ndarray, int]:
269
+ segment, sample_rate = soundfile.read(
270
+ io.BytesIO(data),
271
+ dtype="float32",
272
+ always_2d=True,
273
+ frames=-1,
274
+ start=0,
275
+ format="RAW",
276
+ subtype="PCM_16",
277
+ samplerate=self.incoming_sample_rate,
278
+ channels=1,
279
+ )
280
+ if self.debug:
281
+ self.test_incoming_wav.seek(0, soundfile.SEEK_END)
282
+ self.test_incoming_wav.write(segment)
283
+
284
+ segment = segment.T
285
+ segment, new_sample_rate = self.convert_waveform(
286
+ segment,
287
+ sample_rate,
288
+ normalize_volume=False,
289
+ to_mono=True,
290
+ to_sample_rate=MODEL_SAMPLE_RATE,
291
+ )
292
+
293
+ assert MODEL_SAMPLE_RATE == new_sample_rate
294
+ segment = segment.squeeze(axis=0)
295
+ return segment, new_sample_rate
296
+
297
+ def process_pipeline_impl(self, input_segment):
298
+ try:
299
+ with torch.no_grad():
300
+ output_segment = OutputSegments(
301
+ self.agent.pushpop(input_segment, self.states)
302
+ )
303
+ if (
304
+ self.get_states_root().first_input_ts is not None
305
+ and self.first_input_ts is None
306
+ ):
307
+ # TODO: this is hacky
308
+ self.first_input_ts = self.get_states_root().first_input_ts
309
+
310
+ if not output_segment.is_empty:
311
+ self.output_queue.put_nowait(output_segment)
312
+
313
+ if output_segment.finished:
314
+ self.debug_log("OUTPUT SEGMENT IS FINISHED. Resetting states.")
315
+
316
+ self.reset_states()
317
+
318
+ if self.debug:
319
+ # when we rebuild states, this value is reset to whatever
320
+ # is in the system dir config, which defaults to debug=False.
321
+ self.get_states_root().debug = True
322
+ except Exception as e:
323
+ logger.error(f"Got exception while processing pipeline: {e}")
324
+ traceback.print_exc()
325
+ return input_segment
326
+
327
+ def process_pipeline_loop(self):
328
+ if self.close:
329
+ return # closes the thread
330
+
331
+ self.debug_log("processing_pipeline")
332
+ while not self.close:
333
+ input_segment = self.get_input_segment()
334
+ if input_segment is None:
335
+ if self.get_states_root().is_fresh_state: # TODO: this is hacky
336
+ time.sleep(0.3)
337
+ else:
338
+ time.sleep(0.03)
339
+ continue
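+ # The two sleep intervals above trade idle CPU use against latency: poll
+ # slowly (0.3s) while the pipeline state is fresh (no utterance in
+ # progress) and quickly (0.03s) while one is being processed.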
340
+ self.process_pipeline_impl(input_segment)
341
+ self.debug_log("finished processing_pipeline")
342
+
343
+ def process_pipeline_once(self):
344
+ if self.close:
345
+ return
346
+
347
+ self.debug_log("processing pipeline once")
348
+ input_segment = self.get_input_segment()
349
+ if input_segment is None:
350
+ return
351
+ self.process_pipeline_impl(input_segment)
352
+ self.debug_log("finished processing_pipeline_once")
353
+
354
+ def get_output_segment(self):
355
+ if self.output_queue.empty():
356
+ return None
357
+
358
+ output_chunk = self.output_queue.get_nowait()
359
+ self.output_queue.task_done()
360
+ return output_chunk
361
+
362
+ def start(self):
363
+ self.debug_log("starting transcoder in a thread")
364
+ threading.Thread(target=self.process_pipeline_loop).start()
365
+
366
+ def first_translation_time(self):
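+ # first_input_ts and first_output_ts are epoch timestamps in milliseconds,
+ # so this returns the time-to-first-translation in seconds.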
367
+ return round((self.first_output_ts - self.first_input_ts) / 1000, 2)
368
+
369
+ def get_buffered_output(self) -> SpeechAndTextOutput:
370
+ now = time.time() * 1000
371
+ self.debug_log(f"get_buffered_output queue size: {self.output_queue.qsize()}")
372
+ while not self.output_queue.empty():
373
+ tmp_out = self.get_output_segment()
374
+ if tmp_out and tmp_out.compute_length(self.g2p) > 0:
375
+ if len(self.output_buffer) == 0:
376
+ self.last_output_ts = now
377
+ self._populate_output_buffer(tmp_out)
378
+ self._increment_output_buffer_size(tmp_out)
379
+
380
+ if tmp_out.finished:
381
+ self.debug_log("tmp_out.finished")
382
+ res = self._gather_output_buffer_data(final=True)
383
+ self.debug_log(f"gathered output data: {res}")
384
+ self.output_buffer = []
385
+ self.output_buffer_cur_size = 0
386
+ self.last_output_ts = now
387
+ self.first_output_ts = now
388
+ return res
389
+ else:
390
+ self.debug_log("tmp_out.compute_length is not > 0")
391
+
392
+ if len(self.output_buffer) > 0 and (
393
+ now - self.last_output_ts >= self.output_buffer_idle_ms
394
+ or self.output_buffer_cur_size >= self.output_buffer_size_limit
395
+ ):
396
+ self.debug_log(
397
+ "[get_buffered_output] output_buffer is not empty. getting res to return."
398
+ )
399
+ self.last_output_ts = now
400
+ res = self._gather_output_buffer_data(final=False)
401
+ self.debug_log(f"gathered output data: {res}")
402
+ self.output_buffer = []
403
+ self.output_buffer_cur_size = 0
404
+ self.first_output_ts = now
405
+ return res
406
+ else:
407
+ self.debug_log("[get_buffered_output] output_buffer is empty...")
408
+ return None
409
+
410
+ def _gather_output_buffer_data(self, final):
411
+ output = SpeechAndTextOutput()
412
+ output.final = final
413
+ output = OutputSegments.join_output_buffer(self.output_buffer, output)
414
+ return output
415
+
416
+ def _increment_output_buffer_size(self, segment: OutputSegments):
417
+ self.output_buffer_cur_size += segment.compute_length(self.g2p)
418
+
419
+ def _populate_output_buffer(self, segment: OutputSegments):
420
+ self.output_buffer.append(segment.segments)
421
+
422
+ def _compute_phoneme_count(self, string: str) -> int:
423
+ return len([x for x in self.g2p(string) if x != " "])
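+
+ # A minimal usage sketch, assuming the bundled "SeamlessStreaming" model dir
+ # with its vad_s2st_sc_24khz_main.yaml config and 48kHz mono PCM16 input;
+ # pcm16_bytes is a placeholder, not defined in this module:
+ #
+ #     agent = SimulevalTranscoder.build_agent(
+ #         "SeamlessStreaming", config_name="vad_s2st_sc_24khz_main.yaml"
+ #     )
+ #     transcoder = SimulevalTranscoder(agent, sample_rate=48000, debug=False, buffer_limit=1)
+ #     transcoder.start()  # runs process_pipeline_loop in a background thread
+ #     transcoder.process_incoming_bytes(pcm16_bytes, {"targetLanguage": "spa"})
+ #     output = transcoder.get_buffered_output()  # poll until it returns non-None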
seamless_server/src/speech_and_text_output.py ADDED
@@ -0,0 +1,15 @@
1
+ # Provides a container to return both speech and text output from our model at the same time
+ from typing import Optional
2
+
3
+
4
+ class SpeechAndTextOutput:
5
+ def __init__(
6
+ self,
7
+ text: Optional[str] = None,
8
+ speech_samples: Optional[list] = None,
9
+ speech_sample_rate: Optional[float] = None,
10
+ final: bool = False,
11
+ ):
12
+ self.text = text
13
+ self.speech_samples = speech_samples
14
+ self.speech_sample_rate = speech_sample_rate
15
+ self.final = final
seamless_server/src/transcoder_helpers.py ADDED
@@ -0,0 +1,43 @@
1
+ import logging
2
+
3
+ logger = logging.getLogger("socketio_server_pubsub")
4
+
5
+
6
+ def get_transcoder_output_events(transcoder) -> list:
7
+ speech_and_text_output = transcoder.get_buffered_output()
8
+ if speech_and_text_output is None:
9
+ logger.debug("No output from transcoder.get_buffered_output()")
10
+ return []
11
+
12
+ logger.debug(f"We DID get output from the transcoder! {speech_and_text_output}")
13
+
14
+ lat = None
15
+
16
+ events = []
17
+
18
+ if speech_and_text_output.speech_samples:
19
+ events.append(
20
+ {
21
+ "event": "translation_speech",
22
+ "payload": speech_and_text_output.speech_samples,
23
+ "sample_rate": speech_and_text_output.speech_sample_rate,
24
+ }
25
+ )
26
+
27
+ if speech_and_text_output.text:
28
+ events.append(
29
+ {
30
+ "event": "translation_text",
31
+ "payload": speech_and_text_output.text,
32
+ }
33
+ )
34
+
35
+ for e in events:
36
+ e["eos"] = speech_and_text_output.final
37
+
38
+ # if not latency_sent:
39
+ # lat = transcoder.first_translation_time()
40
+ # latency_sent = True
41
+ # to_send["latency"] = lat
42
+
43
+ return events
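+
+ # A minimal consumption sketch (illustrative; the async socket.io server
+ # object "sio" and "room" are assumptions, not defined in this helper):
+ #
+ #     for event in get_transcoder_output_events(transcoder):
+ #         # each dict carries "event" and "payload", plus "sample_rate" for
+ #         # speech and an "eos" end-of-stream flag
+ #         await sio.emit(event["event"], event, room=room)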
seamless_server/whl/seamless_communication-1.0.0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1df10e0c85ee0ffbc9f2e1bf8896850a52c551383df0332a94d26d9d39770c85
3
+ size 201552
streaming-react-app/.eslintrc.cjs ADDED
@@ -0,0 +1,18 @@
1
+ module.exports = {
2
+ root: true,
3
+ env: {browser: true, es2020: true},
4
+ extends: [
5
+ 'eslint:recommended',
6
+ 'plugin:@typescript-eslint/recommended',
7
+ 'plugin:react-hooks/recommended',
8
+ ],
9
+ ignorePatterns: ['dist', '.eslintrc.cjs'],
10
+ parser: '@typescript-eslint/parser',
11
+ plugins: ['react-refresh'],
12
+ rules: {
13
+ 'react-refresh/only-export-components': [
14
+ 'warn',
15
+ {allowConstantExport: true},
16
+ ],
17
+ },
18
+ };
streaming-react-app/.gitignore ADDED
@@ -0,0 +1,24 @@
1
+ # Logs
2
+ logs
3
+ *.log
4
+ npm-debug.log*
5
+ yarn-debug.log*
6
+ yarn-error.log*
7
+ pnpm-debug.log*
8
+ lerna-debug.log*
9
+
10
+ node_modules
11
+ dist
12
+ dist-ssr
13
+ *.local
14
+
15
+ # Editor directories and files
16
+ .vscode/*
17
+ !.vscode/extensions.json
18
+ .idea
19
+ .DS_Store
20
+ *.suo
21
+ *.ntvs*
22
+ *.njsproj
23
+ *.sln
24
+ *.sw?
streaming-react-app/README.md ADDED
@@ -0,0 +1,14 @@
1
+ # 🚀 Streaming React App
2
+
3
+ ## Getting Started
4
+
5
+ This project uses the [Yarn Package Manager](https://yarnpkg.com/).
6
+
7
+ 1. `yarn` - Install project dependencies
8
+ 2. `yarn run dev` - Run the app with a development server that supports hot module reloading
9
+
10
+ NOTE: You will need to provide the server URL either via an environment variable (you can use the `.env` file for this) or via a URL param when you load the React app (example: `http://localhost:5173/?serverURL=localhost:8000`)
11
+
12
+ ## URL Parameters
13
+
14
+ You can provide URL parameters in order to change the behavior of the app. Those are documented in [URLParams.ts](src/URLParams.ts).
streaming-react-app/index.html ADDED
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8" />
5
+ <link rel="icon" type="image/svg+xml" href="/src/assets/seamless.svg" />
6
+ <meta name="viewport" content="width=device-width, initial-scale=1.0" />
7
+ <title>Seamless Translation</title>
8
+ </head>
9
+ <body>
10
+ <div id="root"></div>
11
+ <script type="module" src="/src/main.tsx"></script>
12
+ </body>
13
+ </html>
streaming-react-app/package-lock.json ADDED
The diff for this file is too large to render. See raw diff
streaming-react-app/package.json ADDED
@@ -0,0 +1,54 @@
1
+ {
2
+ "name": "streaming-react-app",
3
+ "private": true,
4
+ "version": "0.0.13",
5
+ "type": "module",
6
+ "scripts": {
7
+ "dev": "vite --host --strictPort",
8
+ "build": "tsc && vite build",
9
+ "preview": "vite preview",
10
+ "clean:node-modules": "rm -rf node_modules/",
11
+ "ts-check": "tsc --noEmit",
12
+ "lint": "eslint . --ext ts,tsx --report-unused-disable-directives --max-warnings 0",
13
+ "prettier-check": "cd ../ && yarn run prettier-base --check streaming-react-app",
14
+ "signal": "concurrently --names \"TS,LINT,PRETTIER\" -c \"bgBlack.bold,bgRed.bold,bgCyan.bold\" \"yarn run ts-check\" \"yarn run lint\" \"yarn run prettier-check\""
15
+ },
16
+ "dependencies": {
17
+ "@emotion/react": "11.11.1",
18
+ "@emotion/styled": "11.11.0",
19
+ "@mui/icons-material": "5.14.3",
20
+ "@mui/material": "5.14.5",
21
+ "@react-three/drei": "^9.83.9",
22
+ "@react-three/fiber": "^8.14.1",
23
+ "@react-three/xr": "^5.7.1",
24
+ "amazon-cognito-identity-js": "^6.3.6",
25
+ "audiobuffer-to-wav": "^1.0.0",
26
+ "aws-sdk": "^2.1472.0",
27
+ "iso-639-1": "^3.1.0",
28
+ "js-cookie": "^3.0.5",
29
+ "lodash": "4.17.21",
30
+ "react": "^18.2.0",
31
+ "react-dom": "^18.2.0",
32
+ "react-google-charts": "^4.0.1",
33
+ "socket.io-client": "^4.7.2",
34
+ "three": "^0.156.1",
35
+ "three-mesh-ui": "^6.5.4",
36
+ "uuid": "^9.0.0",
37
+ "zustand": "^4.4.3"
38
+ },
39
+ "devDependencies": {
40
+ "@types/node": "^20.5.3",
41
+ "@types/react": "^18.2.15",
42
+ "@types/react-dom": "^18.2.7",
43
+ "@types/uuid": "^9.0.2",
44
+ "@typescript-eslint/eslint-plugin": "^6.0.0",
45
+ "@typescript-eslint/parser": "^6.0.0",
46
+ "@vitejs/plugin-react": "^4.0.3",
47
+ "concurrently": "8.2.1",
48
+ "eslint": "^8.45.0",
49
+ "eslint-plugin-react-hooks": "^4.6.0",
50
+ "eslint-plugin-react-refresh": "^0.4.3",
51
+ "typescript": "5.1.6",
52
+ "vite": "^4.4.5"
53
+ }
54
+ }
streaming-react-app/src/App.tsx ADDED
@@ -0,0 +1,57 @@
1
+ import SocketWrapper from './SocketWrapper';
2
+ import {ThemeProvider} from '@mui/material/styles';
3
+ import theme from './theme';
4
+ import StreamingInterface from './StreamingInterface';
5
+ import CssBaseline from '@mui/material/CssBaseline';
6
+ import {createContext, useCallback, useState} from 'react';
7
+ import packageJson from '../package.json';
8
+
9
+ console.log(`Streaming React App version: ${packageJson?.version}`);
10
+
11
+ // Roboto font for mui ui library
12
+ // import '@fontsource/roboto/300.css';
13
+ // import '@fontsource/roboto/400.css';
14
+ // import '@fontsource/roboto/500.css';
15
+ // import '@fontsource/roboto/700.css';
16
+
17
+ export const AppResetKeyContext = createContext<(newKey: string) => void>(
18
+ () => {
19
+ throw new Error('AppResetKeyContext not initialized');
20
+ },
21
+ );
22
+
23
+ function App() {
24
+ return (
25
+ <ThemeProvider theme={theme}>
26
+ <CssBaseline />
27
+ <SocketWrapper>
28
+ <StreamingInterface />
29
+ </SocketWrapper>
30
+ </ThemeProvider>
31
+ );
32
+ }
33
+
34
+ function AppWrapper() {
35
+ const [appResetKey, setAppResetKey] = useState<string>('[initial value]');
36
+ const setAppResetKeyHandler = useCallback((newKey: string) => {
37
+ setAppResetKey((prev) => {
38
+ console.warn(
39
+ `Resetting the app with appResetKey: ${newKey}; prevKey: ${prev}`,
40
+ );
41
+ if (prev === newKey) {
42
+ console.error(
43
+ `The appResetKey was the same as the previous key, so the app will not reset.`,
44
+ );
45
+ }
46
+ return newKey;
47
+ });
48
+ }, []);
49
+
50
+ return (
51
+ <AppResetKeyContext.Provider value={setAppResetKeyHandler}>
52
+ <App key={appResetKey} />
53
+ </AppResetKeyContext.Provider>
54
+ );
55
+ }
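+
+ // Note: the reset works via React's `key` semantics: when the key passed to
+ // <App key={appResetKey} /> changes, React unmounts the old App instance and
+ // mounts a fresh one, discarding all of its state.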
56
+
57
+ export default AppWrapper;
streaming-react-app/src/Blink.tsx ADDED
@@ -0,0 +1,41 @@
1
+ import Box from '@mui/material/Box';
2
+ import {useEffect, useState} from 'react';
3
+
4
+ type Props = {
5
+ intervalMs: number;
6
+ children: React.ReactNode;
7
+ shouldBlink: boolean;
8
+ // display?: 'block' | 'inline' | 'inline-block';
9
+ };
10
+
11
+ export default function Blink({
12
+ // display = 'inline-block',
13
+ shouldBlink,
14
+ intervalMs,
15
+ children,
16
+ }: Props): React.ReactElement {
17
+ const [cursorBlinkOn, setCursorBlinkOn] = useState(false);
18
+
19
+ useEffect(() => {
20
+ if (shouldBlink) {
21
+ const interval = setInterval(() => {
22
+ setCursorBlinkOn((prev) => !prev);
23
+ }, intervalMs);
24
+
25
+ return () => clearInterval(interval);
26
+ } else {
27
+ setCursorBlinkOn(false);
28
+ }
29
+ }, [intervalMs, shouldBlink]);
30
+
31
+ return (
32
+ <Box
33
+ component="span"
34
+ sx={{
35
+ display: 'inline-block',
36
+ visibility: cursorBlinkOn ? 'visible' : 'hidden',
37
+ }}>
38
+ {children}
39
+ </Box>
40
+ );
41
+ }
streaming-react-app/src/DebugSection.tsx ADDED
@@ -0,0 +1,62 @@
1
+ import {Chart} from 'react-google-charts';
2
+ import debug from './debug';
3
+ import {
4
+ Accordion,
5
+ AccordionDetails,
6
+ AccordionSummary,
7
+ Button,
8
+ Typography,
9
+ } from '@mui/material';
10
+ import {useState} from 'react';
11
+ import ArrowDropDownIcon from '@mui/icons-material/ArrowDropDown';
12
+
13
+ export default function DebugChart() {
14
+ const [showDebugTimings, setShowDebugTimings] = useState<boolean>(false);
15
+
16
+ const data = debug()?.getChartData();
17
+ const options = {
18
+ timeline: {
19
+ groupByRowLabel: true,
20
+ },
21
+ };
22
+
23
+ return (
24
+ <div className="horizontal-padding-sra text-chunk-sra">
25
+ <Accordion
26
+ expanded={showDebugTimings}
27
+ onChange={() => setShowDebugTimings(!showDebugTimings)}
28
+ elevation={0}
29
+ sx={{border: 1, borderColor: 'rgba(0, 0, 0, 0.3)'}}>
30
+ <AccordionSummary
31
+ expandIcon={<ArrowDropDownIcon />}
32
+ className="debug-section">
33
+ Debug Info
34
+ </AccordionSummary>
35
+ <AccordionDetails>
36
+ {data && data.length > 1 ? (
37
+ <>
38
+ <Chart
39
+ chartType="Timeline"
40
+ data={data}
41
+ width="100%"
42
+ height="400px"
43
+ options={options}
44
+ />
45
+ <Button
46
+ variant="contained"
47
+ sx={{marginBottom: 1}}
48
+ onClick={() => {
49
+ debug()?.downloadInputAudio();
50
+ debug()?.downloadOutputAudio();
51
+ }}>
52
+ Download Input / Output Audio
53
+ </Button>
54
+ </>
55
+ ) : (
56
+ <Typography>No input / output detected</Typography>
57
+ )}
58
+ </AccordionDetails>
59
+ </Accordion>
60
+ </div>
61
+ );
62
+ }
streaming-react-app/src/RoomConfig.tsx ADDED
@@ -0,0 +1,262 @@
1
+ import Stack from '@mui/material/Stack';
2
+ import TextField from '@mui/material/TextField';
3
+ import {isValidRoomID, isValidPartialRoomID} from './generateNewRoomID';
4
+ import {useCallback, useEffect, useState} from 'react';
5
+ import Button from '@mui/material/Button';
6
+ import {useSocket} from './useSocket';
7
+ import FormGroup from '@mui/material/FormGroup';
8
+ import FormControlLabel from '@mui/material/FormControlLabel';
9
+ import Checkbox from '@mui/material/Checkbox';
10
+ import {RoomState} from './types/RoomState';
11
+ import setURLParam from './setURLParam';
12
+ import {getURLParams} from './URLParams';
13
+ import {
14
+ JoinRoomConfig,
15
+ Roles,
16
+ ServerState,
17
+ StreamingStatus,
18
+ } from './types/StreamingTypes';
19
+ import Alert from '@mui/material/Alert';
20
+
21
+ function capitalize(str: string): string {
22
+ return str.charAt(0).toUpperCase() + str.slice(1);
23
+ }
24
+
25
+ type Props = {
26
+ roomState: RoomState | null;
27
+ serverState: ServerState | null;
28
+ onJoinRoomOrUpdateRoles?: () => void;
29
+ streamingStatus: StreamingStatus;
30
+ };
31
+
32
+ export default function RoomConfig({
33
+ roomState,
34
+ serverState,
35
+ onJoinRoomOrUpdateRoles,
36
+ streamingStatus,
37
+ }: Props) {
38
+ const {socket, clientID} = useSocket();
39
+
40
+ const urlParams = getURLParams();
41
+ const roomIDParam = urlParams.roomID;
42
+ const autoJoinRoom = urlParams.autoJoin;
43
+
44
+ const [roomID, setRoomID] = useState<string>(
45
+ (roomIDParam ?? '').toUpperCase(),
46
+ );
47
+ const [roomIDError, setRoomIDError] = useState<boolean>(false);
48
+ const [roles, setRoles] = useState<{speaker: boolean; listener: boolean}>({
49
+ speaker: true,
50
+ listener: true,
51
+ });
52
+ const [lockServer, setLockServer] = useState<boolean>(false);
53
+ const [lockServerName, setLockServerName] = useState<string>('');
54
+
55
+ const [joinInProgress, setJoinInProgress] = useState<boolean>(false);
56
+ const [didAttemptAutoJoin, setDidAttemptAutoJoin] = useState<boolean>(false);
57
+
58
+ const isValidServerLock =
59
+ lockServer === false ||
60
+ (lockServerName != null && lockServerName.length > 0);
61
+ const isValidRoles = Object.values(roles).filter(Boolean).length > 0;
62
+ const isValidAllInputs =
63
+ isValidRoomID(roomID) && isValidRoles && isValidServerLock;
64
+ const roomIDFromServer = roomState?.room_id ?? null;
65
+
66
+ const onJoinRoom = useCallback(
67
+ (createNewRoom: boolean) => {
68
+ if (socket == null) {
69
+ console.error('Socket is null, cannot join room');
70
+ return;
71
+ }
72
+ console.debug(`Attempting to join roomID ${roomID}...`);
73
+
74
+ const lockServerValidated: string | null =
75
+ lockServer && roles['speaker'] ? lockServerName : null;
76
+
77
+ setJoinInProgress(true);
78
+
79
+ const configObject: JoinRoomConfig = {
80
+ roles: (Object.keys(roles) as Array<Roles>).filter(
81
+ (role) => roles[role] === true,
82
+ ),
83
+ lockServerName: lockServerValidated,
84
+ };
85
+
86
+ socket.emit(
87
+ 'join_room',
88
+ clientID,
89
+ createNewRoom ? null : roomID,
90
+ configObject,
91
+ (result) => {
92
+ console.log('join_room result:', result);
93
+ if (createNewRoom) {
94
+ setRoomID(result.roomID);
95
+ }
96
+ if (onJoinRoomOrUpdateRoles != null) {
97
+ onJoinRoomOrUpdateRoles();
98
+ }
99
+ setURLParam('roomID', result.roomID);
100
+ setJoinInProgress(false);
101
+ },
102
+ );
103
+ },
104
+ [
105
+ clientID,
106
+ lockServer,
107
+ lockServerName,
108
+ onJoinRoomOrUpdateRoles,
109
+ roles,
110
+ roomID,
111
+ socket,
112
+ ],
113
+ );
114
+
115
+ useEffect(() => {
116
+ if (
117
+ autoJoinRoom === true &&
118
+ didAttemptAutoJoin === false &&
119
+ socket != null
120
+ ) {
121
+ // We want to consider this an attempt whether or not we actually try to join, because
122
+ // we only want auto-join to happen on initial load
123
+ setDidAttemptAutoJoin(true);
124
+ if (
125
+ isValidAllInputs &&
126
+ joinInProgress === false &&
127
+ roomIDFromServer == null
128
+ ) {
129
+ console.debug('Attempting to auto-join room...');
130
+
131
+ onJoinRoom(false);
132
+ } else {
133
+ console.debug('Unable to auto-join room', {
134
+ isValidAllInputs,
135
+ joinInProgress,
136
+ roomIDFromServer,
137
+ });
138
+ }
139
+ }
140
+ }, [
141
+ autoJoinRoom,
142
+ didAttemptAutoJoin,
143
+ isValidAllInputs,
144
+ joinInProgress,
145
+ onJoinRoom,
146
+ roomIDFromServer,
147
+ socket,
148
+ ]);
149
+
150
+ return (
151
+ <Stack direction="column" spacing="12px">
152
+ <Stack direction="row" spacing="12px" sx={{alignItems: 'center'}}>
153
+ <TextField
154
+ size="small"
155
+ label="Room Code"
156
+ variant="outlined"
157
+ disabled={roomState?.room_id != null}
158
+ value={roomID}
159
+ error={roomIDError}
160
+ onChange={(e) => {
161
+ const id = e.target.value.toUpperCase();
162
+ if (isValidPartialRoomID(id)) {
163
+ setRoomIDError(false);
164
+ setRoomID(id);
165
+ } else {
166
+ setRoomIDError(true);
167
+ }
168
+ }}
169
+ sx={{width: '8em'}}
170
+ />
171
+
172
+ <div>
173
+ <Button
174
+ variant="contained"
175
+ disabled={
176
+ isValidAllInputs === false ||
177
+ joinInProgress ||
178
+ streamingStatus !== 'stopped'
179
+ }
180
+ onClick={() => onJoinRoom(false)}>
181
+ {roomState?.room_id != null ? 'Update Roles' : 'Join Room'}
182
+ </Button>
183
+ </div>
184
+
185
+ {roomState?.room_id == null && (
186
+ <div>
187
+ <Button
188
+ variant="contained"
189
+ disabled={
190
+ roomState?.room_id != null ||
191
+ joinInProgress ||
192
+ streamingStatus !== 'stopped'
193
+ }
194
+ onClick={() => onJoinRoom(true)}>
195
+ {'Create New Room'}
196
+ </Button>
197
+ </div>
198
+ )}
199
+ </Stack>
200
+
201
+ <FormGroup>
202
+ {Object.keys(roles).map((role) => {
203
+ return (
204
+ <FormControlLabel
205
+ disabled={streamingStatus !== 'stopped'}
206
+ key={role}
207
+ control={
208
+ <Checkbox
209
+ checked={roles[role]}
210
+ onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
211
+ setRoles((prevRoles) => ({
212
+ ...prevRoles,
213
+ [role]: event.target.checked,
214
+ }));
215
+ }}
216
+ />
217
+ }
218
+ label={capitalize(role)}
219
+ />
220
+ );
221
+ })}
222
+
223
+ {urlParams.enableServerLock && roles['speaker'] === true && (
224
+ <>
225
+ <FormControlLabel
226
+ disabled={streamingStatus !== 'stopped'}
227
+ control={
228
+ <Checkbox
229
+ checked={lockServer}
230
+ onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
231
+ setLockServer(event.target.checked);
232
+ }}
233
+ />
234
+ }
235
+ label="Lock Server (prevent other users from streaming)"
236
+ />
237
+ </>
238
+ )}
239
+ </FormGroup>
240
+
241
+ {urlParams.enableServerLock &&
242
+ roles['speaker'] === true &&
243
+ lockServer && (
244
+ <TextField
245
+ disabled={streamingStatus !== 'stopped'}
246
+ label="Enter Your Name + Expected Lock End Time"
247
+ variant="outlined"
248
+ value={lockServerName}
249
+ onChange={(event: React.ChangeEvent<HTMLInputElement>) => {
250
+ setLockServerName(event.target.value);
251
+ }}
252
+ helperText="Locking the server will prevent anyone else from using it until you close the page, in order to maximize server performance. Please only use this for live demos."
253
+ />
254
+ )}
255
+
256
+ {serverState?.serverLock != null &&
257
+ serverState.serverLock.clientID === clientID && (
258
+ <Alert severity="success">{`The server is now locked for your use (${serverState?.serverLock?.name}). Close this window to release the lock so that others may use the server.`}</Alert>
259
+ )}
260
+ </Stack>
261
+ );
262
+ }
streaming-react-app/src/SocketWrapper.tsx ADDED
@@ -0,0 +1,218 @@
1
+ import {useContext, useEffect, useMemo, useRef, useState} from 'react';
2
+ import socketIOClient, {Socket} from 'socket.io-client';
3
+ import useStable from './useStable';
4
+ import {v4 as uuidv4} from 'uuid';
5
+ import {SocketContext} from './useSocket';
6
+ import {AppResetKeyContext} from './App';
7
+ import Backdrop from '@mui/material/Backdrop';
8
+ import CircularProgress from '@mui/material/CircularProgress';
9
+ import Typography from '@mui/material/Typography';
10
+ import {getURLParams} from './URLParams';
11
+
12
+ // The time to wait before showing a "disconnected" screen upon initial app load
13
+ const INITIAL_DISCONNECT_SCREEN_DELAY = 2000;
14
+ const SERVER_URL_DEFAULT = `${window.location.protocol === "https:" ? "wss" : "ws"
15
+ }://${window.location.host}`;
16
+
17
+ export default function SocketWrapper({children}) {
18
+ const [socket, setSocket] = useState<Socket | null>(null);
19
+ const [connected, setConnected] = useState<boolean | null>(null);
20
+ // Default to true:
21
+ const [willAttemptReconnect] = useState<boolean>(true);
22
+ const serverIDRef = useRef<string | null>(null);
23
+
24
+ const setAppResetKey = useContext(AppResetKeyContext);
25
+
26
+ /**
27
+ * Previously we had stored the clientID in local storage, but in that case
28
+ * if a user refreshes their page they'll still have the same clientID, and
29
+ * will be put back into the same room, which may be confusing if they're trying
30
+ * to join a new room or reset the app interface. So now clientIDs persist only as
31
+ * long as the React app's full lifecycle.
32
+ */
33
+ const clientID = useStable<string>(() => {
34
+ const newID = uuidv4();
35
+ // NOTE: the ID is deliberately not persisted to session or local storage;
36
+ // see the comment above for why a fresh ID per app lifecycle is desired.
37
+ return newID;
38
+ });
39
+
40
+ const socketObject = useMemo(
41
+ () => ({socket, clientID, connected: connected ?? false}),
42
+ [socket, clientID, connected],
43
+ );
44
+
45
+ useEffect(() => {
46
+ const queryParams = {
47
+ clientID: clientID,
48
+ };
49
+
50
+ const serverURLFromParams = getURLParams().serverURL;
51
+ const serverURL = serverURLFromParams ?? SERVER_URL_DEFAULT;
52
+
53
+ console.log(
54
+ `Opening socket connection to ${
55
+ serverURL?.length === 0 ? 'window.location.host' : serverURL
56
+ } with query params:`,
57
+ queryParams,
58
+ );
59
+
60
+ const newSocket: Socket = socketIOClient(serverURL, {
61
+ query: queryParams,
62
+ // Normally socket.io will fallback to http polling, but we basically never
63
+ // want that because that'd mean awful performance. It'd be better for the app
64
+ // to simply break in that case and not connect.
65
+ transports: ['websocket'],
66
+ path: '/ws/socket.io'
67
+ });
68
+
69
+ const onServerID = (serverID: string) => {
70
+ console.debug('Received server ID:', serverID);
71
+ if (serverIDRef.current != null) {
72
+ if (serverIDRef.current !== serverID) {
73
+ console.error(
74
+ 'Server ID changed. Resetting the app using the app key',
75
+ );
76
+ setAppResetKey(serverID);
77
+ }
78
+ }
79
+ serverIDRef.current = serverID;
80
+ };
81
+
82
+ newSocket.on('server_id', onServerID);
83
+
84
+ setSocket(newSocket);
85
+
86
+ return () => {
87
+ newSocket.off('server_id', onServerID);
88
+ console.log(
89
+ 'Closing socket connection in the useEffect cleanup function...',
90
+ );
91
+ newSocket.disconnect();
92
+ setSocket(null);
93
+ };
94
+ }, [clientID, setAppResetKey]);
95
+
96
+ useEffect(() => {
97
+ if (socket != null) {
98
+ const onAny = (eventName: string, ...args) => {
99
+ console.debug(`[event: ${eventName}] args:`, ...args);
100
+ };
101
+
102
+ socket.onAny(onAny);
103
+
104
+ return () => {
105
+ socket.offAny(onAny);
106
+ };
107
+ }
108
+ return () => {};
109
+ }, [socket]);
110
+
111
+ useEffect(() => {
112
+ if (socket != null) {
113
+ const onConnect = (...args) => {
114
+ console.debug('Connected to server with args:', ...args);
115
+ setConnected(true);
116
+ };
117
+
118
+ const onConnectError = (err) => {
119
+ console.error(`Connection error due to ${err.message}`);
120
+ };
121
+
122
+ const onDisconnect = (reason) => {
123
+ setConnected(false);
124
+ console.log(`Disconnected due to ${reason}`);
125
+ };
126
+
127
+ socket.on('connect', onConnect);
128
+ socket.on('connect_error', onConnectError);
129
+ socket.on('disconnect', onDisconnect);
130
+
131
+ return () => {
132
+ socket.off('connect', onConnect);
133
+ socket.off('connect_error', onConnectError);
134
+ socket.off('disconnect', onDisconnect);
135
+ };
136
+ }
137
+ }, [socket]);
138
+
139
+ useEffect(() => {
140
+ if (socket != null) {
141
+ const onReconnectError = (err) => {
142
+ console.log(`Reconnect error due to ${err.message}`);
143
+ };
144
+
145
+ socket.io.on('reconnect_error', onReconnectError);
146
+
147
+ const onError = (err) => {
148
+ console.log(`General socket error with message ${err.message}`);
149
+ };
150
+ socket.io.on('error', onError);
151
+
152
+ const onReconnect = (attempt) => {
153
+ console.log(`Reconnected after ${attempt} attempt(s)`);
154
+ };
155
+ socket.io.on('reconnect', onReconnect);
156
+
157
+ const disconnectOnBeforeUnload = () => {
158
+ console.log('Disconnecting due to beforeunload event...');
159
+ socket.disconnect();
160
+ setSocket(null);
161
+ };
162
+ window.addEventListener('beforeunload', disconnectOnBeforeUnload);
163
+
164
+ return () => {
165
+ socket.io.off('reconnect_error', onReconnectError);
166
+ socket.io.off('error', onError);
167
+ socket.io.off('reconnect', onReconnect);
168
+ window.removeEventListener('beforeunload', disconnectOnBeforeUnload);
169
+ };
170
+ }
171
+ }, [clientID, setAppResetKey, socket]);
172
+
173
+ /**
174
+ * Wait to show the disconnected screen on initial app load
175
+ */
176
+ useEffect(() => {
177
+ window.setTimeout(() => {
178
+ setConnected((prev) => {
179
+ if (prev === null) {
180
+ return false;
181
+ }
182
+ return prev;
183
+ });
184
+ }, INITIAL_DISCONNECT_SCREEN_DELAY);
185
+ }, []);
186
+
187
+ return (
188
+ <SocketContext.Provider value={socketObject}>
189
+ {children}
190
+
191
+ <Backdrop
192
+ open={connected === false && willAttemptReconnect === true}
193
+ sx={{
194
+ color: '#fff',
195
+ zIndex: (theme) => theme.zIndex.drawer + 1,
196
+ }}>
197
+ <div
198
+ style={{
199
+ alignItems: 'center',
200
+ flexDirection: 'column',
201
+ textAlign: 'center',
202
+ }}>
203
+ <CircularProgress color="inherit" />
204
+ <Typography
205
+ align="center"
206
+ fontSize={{sm: 18, xs: 16}}
207
+ sx={{
208
+ fontFamily:
209
+ 'ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace',
210
+ fontWeight: 'bold',
211
+ }}>
212
+ {'Disconnected. Attempting to reconnect...'}
213
+ </Typography>
214
+ </div>
215
+ </Backdrop>
216
+ </SocketContext.Provider>
217
+ );
218
+ }
streaming-react-app/src/StreamingInterface.css ADDED
@@ -0,0 +1,56 @@
1
+ .app-wrapper-sra {
2
+ display: flex;
3
+ flex-direction: column;
4
+ justify-content: center;
5
+ align-items: center;
6
+ }
7
+
8
+ .main-container-sra {
9
+ background-color: white;
10
+ display: flex;
11
+ flex-direction: column;
12
+ justify-content: flex-start;
13
+ text-align: left;
14
+ margin: 16px;
15
+ margin-bottom: 36px;
16
+ border-radius: 8px;
17
+ box-shadow: 0px 24px 30px rgba(0, 0, 0, 0.3);
18
+ border: 1px solid rgba(0, 0, 0, 0.05);
19
+ overflow: hidden;
20
+ }
21
+
22
+ .top-section-sra {
23
+ padding-top: 24px;
24
+ margin-bottom: 24px;
25
+ display: flex;
26
+ flex-direction: column;
27
+ justify-content: flex-start;
28
+ }
29
+
30
+ .horizontal-padding-sra {
31
+ padding-left: 20px;
32
+ padding-right: 20px;
33
+ }
34
+
35
+ .header-container-sra {
36
+ display: flex;
37
+ flex-direction: row;
38
+ justify-content: flex-start;
39
+ align-items: center;
40
+ margin-bottom: 24px;
41
+ }
42
+
43
+ .header-icon-sra {
44
+ display: block;
45
+ margin-right: 12px;
46
+ }
47
+
48
+ .translation-text-container-sra {
49
+ background-color: #f8f8f8;
50
+ padding-top: 12px;
51
+ padding-bottom: 4px;
52
+ }
53
+
54
+ .text-chunk-sra {
55
+ margin-bottom: 12px;
56
+ }
streaming-react-app/src/StreamingInterface.tsx ADDED
@@ -0,0 +1,1165 @@
1
+ import {useCallback, useEffect, useLayoutEffect, useRef, useState} from 'react';
2
+ import Button from '@mui/material/Button';
3
+ import Typography from '@mui/material/Typography';
4
+ import InputLabel from '@mui/material/InputLabel';
5
+ import FormControl from '@mui/material/FormControl';
6
+ import Select, {SelectChangeEvent} from '@mui/material/Select';
7
+ import MenuItem from '@mui/material/MenuItem';
8
+ import Stack from '@mui/material/Stack';
9
+ import seamlessLogoUrl from './assets/seamless.svg';
10
+ import {
11
+ AgentCapabilities,
12
+ BaseResponse,
13
+ BrowserAudioStreamConfig,
14
+ DynamicConfig,
15
+ PartialDynamicConfig,
16
+ SUPPORTED_INPUT_SOURCES,
17
+ SUPPORTED_OUTPUT_MODES,
18
+ ServerExceptionData,
19
+ ServerSpeechData,
20
+ ServerState,
21
+ ServerTextData,
22
+ StartStreamEventConfig,
23
+ StreamingStatus,
24
+ SupportedInputSource,
25
+ SupportedOutputMode,
26
+ TranslationSentences,
27
+ } from './types/StreamingTypes';
28
+ import FormLabel from '@mui/material/FormLabel';
29
+ import RadioGroup from '@mui/material/RadioGroup';
30
+ import FormControlLabel from '@mui/material/FormControlLabel';
31
+ import Radio from '@mui/material/Radio';
32
+ import './StreamingInterface.css';
33
+ import RoomConfig from './RoomConfig';
34
+ import Divider from '@mui/material/Divider';
35
+ import {useSocket} from './useSocket';
36
+ import {RoomState} from './types/RoomState';
37
+ import useStable from './useStable';
38
+ import float32To16BitPCM from './float32To16BitPCM';
39
+ import createBufferedSpeechPlayer from './createBufferedSpeechPlayer';
40
+ import Checkbox from '@mui/material/Checkbox';
41
+ import Alert from '@mui/material/Alert';
42
+ import isScrolledToDocumentBottom from './isScrolledToDocumentBottom';
43
+ import Box from '@mui/material/Box';
44
+ import Slider from '@mui/material/Slider';
45
+ import VolumeDown from '@mui/icons-material/VolumeDown';
46
+ import VolumeUp from '@mui/icons-material/VolumeUp';
47
+ import Mic from '@mui/icons-material/Mic';
48
+ import MicOff from '@mui/icons-material/MicOff';
49
+ import XRDialog from './react-xr/XRDialog';
50
+ import getTranslationSentencesFromReceivedData from './getTranslationSentencesFromReceivedData';
51
+ import {
52
+ sliceTranslationSentencesUpToIndex,
53
+ getTotalSentencesLength,
54
+ } from './sliceTranslationSentencesUtils';
55
+ import Blink from './Blink';
56
+ import {CURSOR_BLINK_INTERVAL_MS} from './cursorBlinkInterval';
57
+ import {getURLParams} from './URLParams';
58
+ import debug from './debug';
59
+ import DebugSection from './DebugSection';
60
+ import {Grid} from '@mui/material';
61
+ import {getLanguageFromThreeLetterCode} from './languageLookup';
62
+
63
+ const AUDIO_STREAM_DEFAULTS: {
64
+ [key in SupportedInputSource]: BrowserAudioStreamConfig;
65
+ } = {
66
+ userMedia: {
67
+ echoCancellation: false,
68
+ noiseSuppression: true,
69
+ },
70
+ displayMedia: {
71
+ echoCancellation: false,
72
+ noiseSuppression: false,
73
+ },
74
+ };
75
+
76
+ async function requestUserMediaAudioStream(
77
+ config: BrowserAudioStreamConfig = {
78
+ echoCancellation: false,
79
+ noiseSuppression: true,
80
+ },
81
+ ) {
82
+ const stream = await navigator.mediaDevices.getUserMedia({
83
+ audio: {...config, channelCount: 1},
84
+ });
85
+ console.debug(
86
+ '[requestUserMediaAudioStream] stream created with settings:',
87
+ stream.getAudioTracks()?.[0]?.getSettings(),
88
+ );
89
+ return stream;
90
+ }
91
+
92
+ async function requestDisplayMediaAudioStream(
93
+ config: BrowserAudioStreamConfig = {
94
+ echoCancellation: false,
95
+ noiseSuppression: false,
96
+ },
97
+ ) {
98
+ const stream = await navigator.mediaDevices.getDisplayMedia({
99
+ audio: {...config, channelCount: 1},
100
+ });
101
+ console.debug(
102
+ '[requestDisplayMediaAudioStream] stream created with settings:',
103
+ stream.getAudioTracks()?.[0]?.getSettings(),
104
+ );
105
+ return stream;
106
+ }
107
+
108
+ const buttonLabelMap: {[key in StreamingStatus]: string} = {
109
+ stopped: 'Start Streaming',
110
+ running: 'Stop Streaming',
111
+ starting: 'Starting...',
112
+ };
113
+
114
+ const BUFFER_LIMIT = 1;
115
+
116
+ const SCROLLED_TO_BOTTOM_THRESHOLD_PX = 36;
117
+
118
+ const GAIN_MULTIPLIER_OVER_1 = 3;
119
+
120
+ const getGainScaledValue = (value) =>
121
+ value > 1 ? (value - 1) * GAIN_MULTIPLIER_OVER_1 + 1 : value;
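+ // e.g. with GAIN_MULTIPLIER_OVER_1 = 3: 0.5 -> 0.5, 1 -> 1, 2 -> 4, 3 -> 7;
+ // i.e. the slider maps linearly up to 1 and 3x more steeply above it.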
122
+
123
+ const TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD = 2;
124
+
125
+ const MAX_SERVER_EXCEPTIONS_TRACKED = 500;
126
+
127
+ export const TYPING_ANIMATION_DELAY_MS = 6;
128
+
129
+ export default function StreamingInterface() {
130
+ const urlParams = getURLParams();
131
+ const debugParam = urlParams.debug;
132
+ const [animateTextDisplay, setAnimateTextDisplay] = useState<boolean>(
133
+ urlParams.animateTextDisplay,
134
+ );
135
+
136
+ const socketObject = useSocket();
137
+ const {socket, clientID} = socketObject;
138
+
139
+ const [serverState, setServerState] = useState<ServerState | null>(null);
140
+ const [agent, setAgent] = useState<AgentCapabilities | null>(null);
141
+ const model = agent?.name ?? null;
142
+ const agentsCapabilities: Array<AgentCapabilities> =
143
+ serverState?.agentsCapabilities ?? [];
144
+ const currentAgent: AgentCapabilities | null =
145
+ agentsCapabilities.find((agent) => agent.name === model) ?? null;
146
+
147
+ const [serverExceptions, setServerExceptions] = useState<
148
+ Array<ServerExceptionData>
149
+ >([]);
150
+ const [roomState, setRoomState] = useState<RoomState | null>(null);
151
+ const roomID = roomState?.room_id ?? null;
152
+ const isSpeaker =
153
+ (clientID != null && roomState?.speakers.includes(clientID)) ?? false;
154
+ const isListener =
155
+ (clientID != null && roomState?.listeners.includes(clientID)) ?? false;
156
+
157
+ const [streamingStatus, setStreamingStatus] =
158
+ useState<StreamingStatus>('stopped');
159
+
160
+ const isStreamConfiguredRef = useRef<boolean>(false);
161
+
162
+ const [outputMode, setOutputMode] = useState<SupportedOutputMode>('s2s&t');
163
+ const [inputSource, setInputSource] =
164
+ useState<SupportedInputSource>('userMedia');
165
+ const [enableNoiseSuppression, setEnableNoiseSuppression] = useState<
166
+ boolean | null
167
+ >(null);
168
+ const [enableEchoCancellation, setEnableEchoCancellation] = useState<
169
+ boolean | null
170
+ >(null);
171
+
172
+ // Dynamic Params:
173
+ const [targetLang, setTargetLang] = useState<string | null>(null);
174
+
175
+ const [serverDebugFlag, setServerDebugFlag] = useState<boolean>(
176
+ debugParam ?? false,
177
+ );
178
+
179
+ const [receivedData, setReceivedData] = useState<Array<ServerTextData>>([]);
180
+ const [
181
+ translationSentencesAnimatedIndex,
182
+ setTranslationSentencesAnimatedIndex,
183
+ ] = useState<number>(0);
184
+
185
+ const lastTranslationResultRef = useRef<HTMLDivElement | null>(null);
186
+
187
+ const [inputStream, setInputStream] = useState<MediaStream | null>(null);
188
+ const [inputStreamSource, setInputStreamSource] =
189
+ useState<MediaStreamAudioSourceNode | null>(null);
190
+ const audioContext = useStable<AudioContext>(() => new AudioContext());
191
+ const [scriptNodeProcessor, setScriptNodeProcessor] =
192
+ useState<ScriptProcessorNode | null>(null);
193
+
194
+ const [muted, setMuted] = useState<boolean>(false);
195
+ // The onaudioprocess script needs an up-to-date reference to the muted state, so
196
+ // we use a ref here and keep it in sync via useEffect
197
+ const mutedRef = useRef<boolean>(muted);
198
+ useEffect(() => {
199
+ mutedRef.current = muted;
200
+ }, [muted]);
201
+
202
+ const [gain, setGain] = useState<number>(1);
203
+
204
+ const isScrolledToBottomRef = useRef<boolean>(isScrolledToDocumentBottom());
205
+
206
+ // Some config options must be set when starting streaming and cannot be changed dynamically.
207
+ // This controls whether they are disabled or not
208
+ const streamFixedConfigOptionsDisabled =
209
+ streamingStatus !== 'stopped' || roomID == null;
210
+
211
+ const bufferedSpeechPlayer = useStable(() => {
212
+ const player = createBufferedSpeechPlayer({
213
+ onStarted: () => {
214
+ console.debug('📢 PLAYBACK STARTED 📢');
215
+ },
216
+ onEnded: () => {
217
+ console.debug('🛑 PLAYBACK ENDED 🛑');
218
+ },
219
+ });
220
+
221
+ // Start the player now so it eagerly plays audio when it arrives
222
+ player.start();
223
+ return player;
224
+ });
225
+
226
+ const translationSentencesBase: TranslationSentences =
227
+ getTranslationSentencesFromReceivedData(receivedData);
228
+
229
+ const translationSentencesBaseTotalLength = getTotalSentencesLength(
230
+ translationSentencesBase,
231
+ );
232
+
233
+ const translationSentences: TranslationSentences = animateTextDisplay
234
+ ? sliceTranslationSentencesUpToIndex(
235
+ translationSentencesBase,
236
+ translationSentencesAnimatedIndex,
237
+ )
238
+ : translationSentencesBase;
239
+
240
+ // We want the blinking cursor to show before any text has arrived, so let's add an empty string so that the cursor shows up
241
+ const translationSentencesWithEmptyStartingString =
242
+ streamingStatus === 'running' && translationSentences.length === 0
243
+ ? ['']
244
+ : translationSentences;
245
+
246
+ /******************************************
247
+ * Event Handlers
248
+ ******************************************/
249
+
250
+ const setAgentAndUpdateParams = useCallback(
251
+ (newAgent: AgentCapabilities | null) => {
252
+ setAgent((prevAgent) => {
253
+ if (prevAgent?.name !== newAgent?.name) {
254
+ setTargetLang(newAgent?.targetLangs[0] ?? null);
255
+ }
256
+ return newAgent;
257
+ });
258
+ },
259
+ [],
260
+ );
261
+
262
+ const onSetDynamicConfig = useCallback(
263
+ async (partialConfig: PartialDynamicConfig) => {
264
+ return new Promise<void>((resolve, reject) => {
265
+ if (socket == null) {
266
+ reject(new Error('[onSetDynamicConfig] socket is null '));
267
+ return;
268
+ }
269
+
270
+ socket.emit(
271
+ 'set_dynamic_config',
272
+ partialConfig,
273
+ (result: BaseResponse) => {
274
+ console.log('[emit result: set_dynamic_config]', result);
275
+ if (result.status === 'ok') {
276
+ resolve();
277
+ } else {
278
+ reject();
279
+ }
280
+ },
281
+ );
282
+ });
283
+ },
284
+ [socket],
285
+ );
286
+
287
+ const configureStreamAsync = ({sampleRate}: {sampleRate: number}) => {
288
+ return new Promise<void>((resolve, reject) => {
289
+ if (socket == null) {
290
+ reject(new Error('[configureStreamAsync] socket is null '));
291
+ return;
292
+ }
293
+ const modelName = agent?.name ?? null;
294
+ if (modelName == null) {
295
+ reject(new Error('[configureStreamAsync] modelName is null '));
296
+ return;
297
+ }
298
+
299
+ const config: StartStreamEventConfig = {
300
+ event: 'config',
301
+ rate: sampleRate,
302
+ model_name: modelName,
303
+ debug: serverDebugFlag,
304
+ // synchronous processing isn't implemented on the v2 pubsub server, so hardcode this to true
305
+ async_processing: true,
306
+ buffer_limit: BUFFER_LIMIT,
307
+ model_type: outputMode,
308
+ };
309
+
310
+ console.log('[configureStreamAsync] sending config', config);
311
+
312
+ socket.emit('configure_stream', config, (statusObject) => {
313
+ if (statusObject.status === 'ok') {
314
+ isStreamConfiguredRef.current = true;
315
+ console.debug(
316
+ '[configureStreamAsync] stream configured!',
317
+ statusObject,
318
+ );
319
+ resolve();
320
+ } else {
321
+ isStreamConfiguredRef.current = false;
322
+ reject(
323
+ new Error(
324
+ `[configureStreamAsync] configure_stream returned status: ${statusObject.status}`,
325
+ ),
326
+ );
327
+ return;
328
+ }
329
+ });
330
+ });
331
+ };
332
+
333
+ const startStreaming = async () => {
334
+ if (streamingStatus !== 'stopped') {
335
+ console.warn(
336
+ `Attempting to start stream when status is ${streamingStatus}`,
337
+ );
338
+ return;
339
+ }
340
+
341
+ setStreamingStatus('starting');
342
+
343
+ if (audioContext.state === 'suspended') {
344
+ console.warn('audioContext was suspended! resuming...');
345
+ await audioContext.resume();
346
+ }
347
+
348
+ let stream: MediaStream | null = null;
349
+
350
+ try {
351
+ if (inputSource === 'userMedia') {
352
+ stream = await requestUserMediaAudioStream({
353
+ noiseSuppression:
354
+ enableNoiseSuppression ??
355
+ AUDIO_STREAM_DEFAULTS['userMedia'].noiseSuppression,
356
+ echoCancellation:
357
+ enableEchoCancellation ??
358
+ AUDIO_STREAM_DEFAULTS['userMedia'].echoCancellation,
359
+ });
360
+ } else if (inputSource === 'displayMedia') {
361
+ stream = await requestDisplayMediaAudioStream({
362
+ noiseSuppression:
363
+ enableNoiseSuppression ??
364
+ AUDIO_STREAM_DEFAULTS['displayMedia'].noiseSuppression,
365
+ echoCancellation:
366
+ enableEchoCancellation ??
367
+ AUDIO_STREAM_DEFAULTS['displayMedia'].echoCancellation,
368
+ });
369
+ } else {
370
+ throw new Error(`Unsupported input source requested: ${inputSource}`);
371
+ }
372
+ setInputStream(stream);
373
+ } catch (e) {
374
+ console.error('[startStreaming] media stream request failed:', e);
375
+ setStreamingStatus('stopped');
376
+ return;
377
+ }
378
+
379
+ const mediaStreamSource = audioContext.createMediaStreamSource(stream);
380
+ setInputStreamSource(mediaStreamSource);
381
+ /**
382
+ * NOTE: This currently uses a deprecated way of processing the audio (createScriptProcessor), but
383
+ * which is easy and convenient for our purposes.
384
+ *
385
+ * Documentation for the deprecated way of doing it is here: https://developer.mozilla.org/en-US/docs/Web/API/BaseAudioContext/createScriptProcessor
386
+ *
387
+ * In an ideal world this would be migrated to something like this SO answer: https://stackoverflow.com/a/65448287
388
+ */
389
+ const scriptProcessor = audioContext.createScriptProcessor(16384, 1, 1);
390
+ setScriptNodeProcessor(scriptProcessor);
391
+
392
+ scriptProcessor.onaudioprocess = (event) => {
393
+ if (isStreamConfiguredRef.current === false) {
394
+ console.debug('[onaudioprocess] stream is not configured yet!');
395
+ return;
396
+ }
397
+ if (socket == null) {
398
+ console.warn('[onaudioprocess] socket is null in onaudioprocess');
399
+ return;
400
+ }
401
+
402
+ if (mutedRef.current) {
403
+ // We still want to send audio to the server when we're muted to ensure we
404
+ // get any remaining audio back from the server, so let's pass an array of length 1 with a value of 0
405
+ const mostlyEmptyInt16Array = new Int16Array(1);
406
+ socket.emit('incoming_audio', mostlyEmptyInt16Array);
407
+ } else {
408
+ const float32Audio = event.inputBuffer.getChannelData(0);
409
+ const pcm16Audio = float32To16BitPCM(float32Audio);
410
+ socket.emit('incoming_audio', pcm16Audio);
411
+ }
412
+
413
+ debug()?.sentAudio(event);
414
+ };
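+
+ // float32To16BitPCM (imported above) converts Web Audio's [-1, 1] float
+ // samples into an Int16Array; a typical implementation (a sketch, not
+ // necessarily the exact code in src/float32To16BitPCM.ts) clamps and scales:
+ //   out[i] = Math.max(-1, Math.min(1, input[i])) * 0x7fff;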
415
+
416
+ mediaStreamSource.connect(scriptProcessor);
417
+ scriptProcessor.connect(audioContext.destination);
418
+
419
+ bufferedSpeechPlayer.start();
420
+
421
+ try {
422
+ if (targetLang == null) {
423
+ throw new Error('[startStreaming] targetLang cannot be nullish');
424
+ }
425
+
426
+ // When we are starting the stream we want to pass all the dynamic config values
427
+ // available before actually configuring and starting the stream
428
+ const fullDynamicConfig: DynamicConfig = {
429
+ targetLanguage: targetLang,
430
+ };
431
+
432
+ await onSetDynamicConfig(fullDynamicConfig);
433
+
434
+ // NOTE: this needs to be the *audioContext* sample rate, not the sample rate of the input stream. Not entirely sure why.
435
+ await configureStreamAsync({
436
+ sampleRate: audioContext.sampleRate,
437
+ });
438
+ } catch (e) {
439
+ console.error('configureStreamAsync failed', e);
440
+ setStreamingStatus('stopped');
441
+ return;
442
+ }
443
+
444
+ setStreamingStatus('running');
445
+ };
446
+
447
+ const stopStreaming = useCallback(async () => {
448
+ if (streamingStatus === 'stopped') {
449
+ console.warn(
450
+ `Attempting to stop stream when status is ${streamingStatus}`,
451
+ );
452
+ return;
453
+ }
454
+
455
+ // Stop the speech playback right away
456
+ bufferedSpeechPlayer.stop();
457
+
458
+ if (inputStreamSource == null || scriptNodeProcessor == null) {
459
+ console.error(
460
+ 'inputStreamSource || scriptNodeProcessor is null in stopStreaming',
461
+ );
462
+ } else {
463
+ inputStreamSource.disconnect(scriptNodeProcessor);
464
+ scriptNodeProcessor.disconnect(audioContext.destination);
465
+
466
+ // Release the mic input so we stop showing the red recording icon in the browser
467
+ inputStream?.getTracks().forEach((track) => track.stop());
468
+ }
469
+
470
+ if (socket == null) {
471
+ console.warn('Unable to emit stop_stream because socket is null');
472
+ } else {
473
+ socket.emit('stop_stream', (result) => {
474
+ console.debug('[emit result: stop_stream]', result);
475
+ });
476
+ }
477
+
478
+ setStreamingStatus('stopped');
479
+ }, [
480
+ audioContext.destination,
481
+ bufferedSpeechPlayer,
482
+ inputStream,
483
+ inputStreamSource,
484
+ scriptNodeProcessor,
485
+ socket,
486
+ streamingStatus,
487
+ ]);
488
+
489
+ const onClearTranscriptForAll = useCallback(() => {
490
+ if (socket != null) {
491
+ socket.emit('clear_transcript_for_all');
492
+ }
493
+ }, [socket]);
494
+
495
+ /******************************************
496
+ * Effects
497
+ ******************************************/
498
+
499
+ useEffect(() => {
500
+ if (socket == null) {
501
+ return;
502
+ }
503
+
504
+ const onRoomStateUpdate = (roomState: RoomState) => {
505
+ setRoomState(roomState);
506
+ };
507
+
508
+ socket.on('room_state_update', onRoomStateUpdate);
509
+
510
+ return () => {
511
+ socket.off('room_state_update', onRoomStateUpdate);
512
+ };
513
+ }, [socket]);
514
+
515
+ useEffect(() => {
516
+ if (socket != null) {
517
+ const onTranslationText = (data: ServerTextData) => {
518
+ setReceivedData((prev) => [...prev, data]);
519
+ debug()?.receivedText(data.payload);
520
+ };
521
+
522
+ const onTranslationSpeech = (data: ServerSpeechData) => {
523
+ bufferedSpeechPlayer.addAudioToBuffer(data.payload, data.sample_rate);
524
+ };
525
+
526
+ socket.on('translation_text', onTranslationText);
527
+ socket.on('translation_speech', onTranslationSpeech);
528
+
529
+ return () => {
530
+ socket.off('translation_text', onTranslationText);
531
+ socket.off('translation_speech', onTranslationSpeech);
532
+ };
533
+ }
534
+ }, [bufferedSpeechPlayer, socket]);
535
+
536
+ useEffect(() => {
537
+ if (socket != null) {
538
+ const onServerStateUpdate = (newServerState: ServerState) => {
539
+ setServerState(newServerState);
540
+
541
+ // If a client creates a server lock, we want to stop streaming if we're not them
542
+ if (
543
+ newServerState.serverLock?.isActive === true &&
544
+ newServerState.serverLock?.clientID !== clientID &&
545
+ streamingStatus === 'running'
546
+ ) {
547
+ stopStreaming();
548
+ }
549
+
550
+ const firstAgentNullable = newServerState.agentsCapabilities[0];
551
+ if (agent == null && firstAgentNullable != null) {
552
+ setAgentAndUpdateParams(firstAgentNullable);
553
+ }
554
+ };
555
+
556
+ socket.on('server_state_update', onServerStateUpdate);
557
+
558
+ return () => {
559
+ socket.off('server_state_update', onServerStateUpdate);
560
+ };
561
+ }
562
+ }, [
563
+ agent,
564
+ clientID,
565
+ setAgentAndUpdateParams,
566
+ socket,
567
+ stopStreaming,
568
+ streamingStatus,
569
+ ]);
570
+
571
+ useEffect(() => {
572
+ if (socket != null) {
573
+ const onServerException = (
574
+ exceptionDataWithoutClientTime: ServerExceptionData,
575
+ ) => {
576
+ const exceptionData = {
577
+ ...exceptionDataWithoutClientTime,
578
+ timeStringClient: new Date(
579
+ exceptionDataWithoutClientTime['timeEpochMs'],
580
+ ).toLocaleString(),
581
+ };
582
+
583
+ setServerExceptions((prev) =>
584
+ [exceptionData, ...prev].slice(0, MAX_SERVER_EXCEPTIONS_TRACKED),
585
+ );
586
+ console.error(
587
+ `[server_exception] The server encountered an exception: ${exceptionData['message']}`,
588
+ exceptionData,
589
+ );
590
+ };
591
+
592
+ socket.on('server_exception', onServerException);
593
+
594
+ return () => {
595
+ socket.off('server_exception', onServerException);
596
+ };
597
+ }
598
+ }, [socket]);
599
+
600
+ useEffect(() => {
601
+ if (socket != null) {
602
+ const onClearTranscript = () => {
603
+ setReceivedData([]);
604
+ setTranslationSentencesAnimatedIndex(0);
605
+ };
606
+
607
+ socket.on('clear_transcript', onClearTranscript);
608
+
609
+ return () => {
610
+ socket.off('clear_transcript', onClearTranscript);
611
+ };
612
+ }
613
+ }, [socket]);
614
+
615
+ useEffect(() => {
616
+ const onScroll = () => {
617
+ if (isScrolledToDocumentBottom(SCROLLED_TO_BOTTOM_THRESHOLD_PX)) {
618
+ isScrolledToBottomRef.current = true;
619
+ return;
620
+ }
621
+ isScrolledToBottomRef.current = false;
622
+ return;
623
+ };
624
+
625
+ document.addEventListener('scroll', onScroll);
626
+
627
+ return () => {
628
+ document.removeEventListener('scroll', onScroll);
629
+ };
630
+ }, []);
631
+
632
+ useLayoutEffect(() => {
633
+ if (
634
+ lastTranslationResultRef.current != null &&
635
+ isScrolledToBottomRef.current
636
+ ) {
637
+ // Scroll the div to the most recent entry
638
+ lastTranslationResultRef.current.scrollIntoView();
639
+ }
640
+ // Run the effect every time data is received, so that
641
+ // we scroll to the bottom even if we're just adding text to
642
+ // a pre-existing chunk
643
+ }, [receivedData]);
644
+
645
+ useEffect(() => {
646
+ if (!animateTextDisplay) {
647
+ return;
648
+ }
649
+
650
+ if (
651
+ translationSentencesAnimatedIndex < translationSentencesBaseTotalLength
652
+ ) {
653
+ const timeout = setTimeout(() => {
654
+ setTranslationSentencesAnimatedIndex((prev) => prev + 1);
655
+ debug()?.startRenderText();
656
+ }, TYPING_ANIMATION_DELAY_MS);
657
+
658
+ return () => clearTimeout(timeout);
659
+ } else {
660
+ debug()?.endRenderText();
661
+ }
662
+ }, [
663
+ animateTextDisplay,
664
+ translationSentencesAnimatedIndex,
665
+ translationSentencesBaseTotalLength,
666
+ ]);
667
+
668
+ /******************************************
669
+ * Sub-components
670
+ ******************************************/
671
+
672
+ const volumeSliderNode = (
673
+ <Stack
674
+ spacing={2}
675
+ direction="row"
676
+ sx={{mb: 1, width: '100%'}}
677
+ alignItems="center">
678
+ <VolumeDown color="primary" />
679
+ <Slider
680
+ aria-label="Volume"
681
+ defaultValue={1}
682
+ scale={getGainScaledValue}
683
+ min={0}
684
+ max={3}
685
+ step={0.1}
686
+ marks={[
687
+ {value: 0, label: '0%'},
688
+ {value: 1, label: '100%'},
689
+ {value: 2, label: '400%'},
690
+ {value: 3, label: '700%'},
691
+ ]}
692
+ valueLabelFormat={(value) => `${(value * 100).toFixed(0)}%`}
693
+ valueLabelDisplay="auto"
694
+ value={gain}
695
+ onChange={(_event: Event, newValue: number | number[]) => {
696
+ if (typeof newValue === 'number') {
697
+ const scaledGain = getGainScaledValue(newValue);
698
+ // We want the actual gain node to use the scaled value
699
+ bufferedSpeechPlayer.setGain(scaledGain);
700
+ // But we want react state to keep track of the non-scaled value
701
+ setGain(newValue);
702
+ } else {
703
+ console.error(
704
+ `[volume slider] Unexpected non-number value: ${newValue}`,
705
+ );
706
+ }
707
+ }}
708
+ />
709
+ <VolumeUp color="primary" />
710
+ </Stack>
711
+ );
712
+
713
+ const xrDialogComponent = (
714
+ <XRDialog
715
+ animateTextDisplay={
716
+ animateTextDisplay &&
717
+ translationSentencesAnimatedIndex === translationSentencesBaseTotalLength
718
+ }
719
+ bufferedSpeechPlayer={bufferedSpeechPlayer}
720
+ translationSentences={translationSentences}
721
+ roomState={roomState}
722
+ roomID={roomID}
723
+ startStreaming={startStreaming}
724
+ stopStreaming={stopStreaming}
725
+ debugParam={debugParam}
726
+ onARHidden={() => {
727
+ setAnimateTextDisplay(urlParams.animateTextDisplay);
728
+ }}
729
+ onARVisible={() => setAnimateTextDisplay(false)}
730
+ />
731
+ );
732
+
733
+ return (
734
+ <div className="app-wrapper-sra">
735
+ <Box
736
+ // eslint-disable-next-line @typescript-eslint/ban-ts-comment
737
+ // @ts-ignore Not sure why it's complaining about complexity here
738
+ sx={{width: '100%', maxWidth: '660px', minWidth: '320px'}}>
739
+ <div className="main-container-sra">
740
+ <div className="top-section-sra horizontal-padding-sra">
741
+ <div className="header-container-sra">
742
+ <img
743
+ src={seamlessLogoUrl}
744
+ className="header-icon-sra"
745
+ alt="Seamless Translation Logo"
746
+ height={24}
747
+ width={24}
748
+ />
749
+
750
+ <div>
751
+ <Typography variant="h1" sx={{color: '#65676B'}}>
752
+ Seamless Translation
753
+ </Typography>
754
+ </div>
755
+ </div>
756
+ <div className="header-container-sra">
757
+ <div>
758
+ <Typography variant="body2" sx={{color: '#65676B'}}>
759
+ Welcome! Join a room as speaker or listener (or both), and share the
760
+ room code to invite listeners.
761
+ <br/>
762
+ The SeamlessStreaming model is a research model and is not released
763
+ for production deployment. The streaming quality is closely
764
+ related to proper VAD segmentation. It works best if you pause
765
+ every couple of sentences, or you may wish to adjust the VAD threshold
766
+ in the model config.
767
+ </Typography>
768
+ </div>
769
+ </div>
770
+ <Stack spacing="22px" direction="column">
771
+ <Box>
772
+ <RoomConfig
773
+ roomState={roomState}
774
+ serverState={serverState}
775
+ streamingStatus={streamingStatus}
776
+ onJoinRoomOrUpdateRoles={() => {
777
+ // If the user has switched from speaker to listener we need to tell the
778
+ // player to play eagerly, since currently the listener doesn't have any stop/start controls
779
+ bufferedSpeechPlayer.start();
780
+ }}
781
+ />
782
+
783
+ {isListener && !isSpeaker && (
784
+ <Box
785
+ sx={{
786
+ paddingX: 6,
787
+ paddingBottom: 2,
788
+ marginY: 2,
789
+ display: 'flex',
790
+ flexDirection: 'column',
791
+ alignItems: 'center',
792
+ }}>
793
+ {volumeSliderNode}
794
+ </Box>
795
+ )}
796
+ </Box>
797
+
798
+ {isSpeaker && (
799
+ <>
800
+ <Divider />
801
+
802
+ <Stack spacing="12px" direction="column">
803
+ <FormLabel id="output-modes-radio-group-label">
804
+ Model
805
+ </FormLabel>
806
+ <FormControl
807
+ disabled={
808
+ streamFixedConfigOptionsDisabled ||
809
+ agentsCapabilities.length === 0
810
+ }
811
+ fullWidth
812
+ sx={{minWidth: '14em'}}>
813
+ <InputLabel id="model-selector-input-label">
814
+ Model
815
+ </InputLabel>
816
+ <Select
817
+ labelId="model-selector-input-label"
818
+ label="Model"
819
+ onChange={(e: SelectChangeEvent) => {
820
+ const newAgent =
821
+ agentsCapabilities.find(
822
+ (agent) => e.target.value === agent.name,
823
+ ) ?? null;
824
+ if (newAgent == null) {
825
+ console.error(
826
+ 'Unable to find agent with name',
827
+ e.target.value,
828
+ );
829
+ }
830
+ setAgentAndUpdateParams(newAgent);
831
+ }}
832
+ value={model ?? ''}>
833
+ {agentsCapabilities.map((agent) => (
834
+ <MenuItem value={agent.name} key={agent.name}>
835
+ {agent.name}
836
+ </MenuItem>
837
+ ))}
838
+ </Select>
839
+ </FormControl>
840
+
841
+ </Stack>
842
+
843
+ <Stack spacing={0.5}>
844
+ <FormLabel id="output-modes-radio-group-label">
845
+ Output
846
+ </FormLabel>
847
+
848
+ <Box sx={{paddingTop: 2, paddingBottom: 1}}>
849
+ <FormControl fullWidth sx={{minWidth: '14em'}}>
850
+ <InputLabel id="target-selector-input-label">
851
+ Target Language
852
+ </InputLabel>
853
+ <Select
854
+ labelId="target-selector-input-label"
855
+ label="Target Language"
856
+ onChange={(e: SelectChangeEvent) => {
857
+ setTargetLang(e.target.value);
858
+ onSetDynamicConfig({
859
+ targetLanguage: e.target.value,
860
+ });
861
+ }}
862
+ value={targetLang ?? ''}>
863
+ {currentAgent?.targetLangs.map((langCode) => (
864
+ <MenuItem value={langCode} key={langCode}>
865
+ {getLanguageFromThreeLetterCode(langCode) != null
866
+ ? `${getLanguageFromThreeLetterCode(
867
+ langCode,
868
+ )} (${langCode})`
869
+ : langCode}
870
+ </MenuItem>
871
+ ))}
872
+ </Select>
873
+ </FormControl>
874
+ </Box>
875
+
876
+ <Grid container>
877
+ <Grid item xs={12} sm={4}>
878
+ <FormControl
879
+ disabled={streamFixedConfigOptionsDisabled}>
880
+ <RadioGroup
881
+ aria-labelledby="output-modes-radio-group-label"
882
+ value={outputMode}
883
+ onChange={(e) =>
884
+ setOutputMode(
885
+ e.target.value as SupportedOutputMode,
886
+ )
887
+ }
888
+ name="output-modes-radio-buttons-group">
889
+ {
890
+ // TODO: Use supported modalities from agentCapabilities
891
+ SUPPORTED_OUTPUT_MODES.map(({value, label}) => (
892
+ <FormControlLabel
893
+ key={value}
894
+ value={value}
895
+ control={<Radio />}
896
+ label={label}
897
+ />
898
+ ))
899
+ }
900
+ </RadioGroup>
901
+ </FormControl>
902
+ </Grid>
903
+
904
+ <Grid item xs={12} sm={8}>
905
+ <Stack
906
+ direction="column"
907
+ spacing={1}
908
+ alignItems="flex-start"
909
+ sx={{flexGrow: 1}}>
910
+ {isListener && (
911
+ <Box
912
+ sx={{
913
+ flexGrow: 1,
914
+ paddingX: 1.5,
915
+ paddingY: 1.5,
916
+ width: '100%',
917
+ }}>
918
+ {volumeSliderNode}
919
+ </Box>
920
+ )}
921
+ </Stack>
922
+ </Grid>
923
+ </Grid>
924
+ </Stack>
925
+
926
+ <Typography variant="body2" sx={{color: '#65676B'}}>
927
+ Note: we don't recommend echo cancellation, as it may distort
928
+ the input audio (dropping words/sentences) if there is output
929
+ audio playing. Instead, you should use headphones if you'd like
930
+ to listen to the output audio while speaking.
931
+ </Typography>
932
+
933
+ <Stack
934
+ direction="row"
935
+ spacing={2}
936
+ justifyContent="space-between">
937
+ <Box sx={{flex: 1}}>
938
+ <FormControl disabled={streamFixedConfigOptionsDisabled}>
939
+ <FormLabel id="input-source-radio-group-label">
940
+ Input Source
941
+ </FormLabel>
942
+ <RadioGroup
943
+ aria-labelledby="input-source-radio-group-label"
944
+ value={inputSource}
945
+ onChange={(e: React.ChangeEvent<HTMLInputElement>) =>
946
+ setInputSource(
947
+ e.target.value as SupportedInputSource,
948
+ )
949
+ }
950
+ name="input-source-radio-buttons-group">
951
+ {SUPPORTED_INPUT_SOURCES.map(({label, value}) => (
952
+ <FormControlLabel
953
+ key={value}
954
+ value={value}
955
+ control={<Radio />}
956
+ label={label}
957
+ />
958
+ ))}
959
+ </RadioGroup>
960
+ </FormControl>
961
+ </Box>
962
+ <Box sx={{flex: 1}}>
963
+ <FormControl disabled={streamFixedConfigOptionsDisabled}>
964
+ <FormLabel>Options</FormLabel>
965
+ <FormControlLabel
966
+ control={
967
+ <Checkbox
968
+ checked={
969
+ enableNoiseSuppression ??
970
+ AUDIO_STREAM_DEFAULTS[inputSource]
971
+ .noiseSuppression
972
+ }
973
+ onChange={(
974
+ event: React.ChangeEvent<HTMLInputElement>,
975
+ ) =>
976
+ setEnableNoiseSuppression(event.target.checked)
977
+ }
978
+ />
979
+ }
980
+ label="Noise Suppression (Browser)"
981
+ />
982
+ <FormControlLabel
983
+ control={
984
+ <Checkbox
985
+ checked={
986
+ enableEchoCancellation ??
987
+ AUDIO_STREAM_DEFAULTS[inputSource]
988
+ .echoCancellation
989
+ }
990
+ onChange={(
991
+ event: React.ChangeEvent<HTMLInputElement>,
992
+ ) =>
993
+ setEnableEchoCancellation(event.target.checked)
994
+ }
995
+ />
996
+ }
997
+ label="Echo Cancellation (Browser)"
998
+ />
999
+ <FormControlLabel
1000
+ control={
1001
+ <Checkbox
1002
+ checked={serverDebugFlag}
1003
+ onChange={(
1004
+ event: React.ChangeEvent<HTMLInputElement>,
1005
+ ) => setServerDebugFlag(event.target.checked)}
1006
+ />
1007
+ }
1008
+ label="Server Debug Flag"
1009
+ />
1010
+ </FormControl>
1011
+ </Box>
1012
+ </Stack>
1013
+
1014
+ <Stack direction="row" spacing={2}>
1015
+ {streamingStatus === 'stopped' ? (
1016
+ <Button
1017
+ variant="contained"
1018
+ onClick={startStreaming}
1019
+ disabled={
1020
+ roomID == null ||
1021
+ // Prevent users from starting streaming if there is a server lock with an active session
1022
+ (serverState?.serverLock?.isActive === true &&
1023
+ serverState.serverLock.clientID !== clientID)
1024
+ }>
1025
+ {buttonLabelMap[streamingStatus]}
1026
+ </Button>
1027
+ ) : (
1028
+ <Button
1029
+ variant="contained"
1030
+ color={
1031
+ streamingStatus === 'running' ? 'error' : 'primary'
1032
+ }
1033
+ disabled={
1034
+ streamingStatus === 'starting' || roomID == null
1035
+ }
1036
+ onClick={stopStreaming}>
1037
+ {buttonLabelMap[streamingStatus]}
1038
+ </Button>
1039
+ )}
1040
+
1041
+ <Box>
1042
+ <Button
1043
+ variant="contained"
1044
+ aria-label={muted ? 'Unmute' : 'Mute'}
1045
+ color={muted ? 'info' : 'primary'}
1046
+ onClick={() => setMuted((prev) => !prev)}
1047
+ sx={{
1048
+ borderRadius: 100,
1049
+ paddingX: 0,
1050
+ minWidth: '36px',
1051
+ }}>
1052
+ {muted ? <MicOff /> : <Mic />}
1053
+ </Button>
1054
+ </Box>
1055
+
1056
+ {roomID == null ? null : (
1057
+ <Box
1058
+ sx={{
1059
+ flexGrow: 1,
1060
+ display: 'flex',
1061
+ justifyContent: 'flex-end',
1062
+ }}>
1063
+ {xrDialogComponent}
1064
+ </Box>
1065
+ )}
1066
+ </Stack>
1067
+
1068
+ {serverExceptions.length > 0 && (
1069
+ <div>
1070
+ <Alert severity="error">
1071
+ {`The server encountered an exception. See the browser console for details. You may need to refresh the page to continue using the app.`}
1072
+ </Alert>
1073
+ </div>
1074
+ )}
1075
+
1076
+ {serverState != null &&
1077
+ serverState.totalActiveTranscoders >=
1078
+ TOTAL_ACTIVE_TRANSCODER_WARNING_THRESHOLD && (
1079
+ <div>
1080
+ <Alert severity="warning">
1081
+ {`The server currently has ${serverState?.totalActiveTranscoders} active streaming sessions. Performance may be degraded.`}
1082
+ </Alert>
1083
+ </div>
1084
+ )}
1085
+
1086
+ {serverState?.serverLock != null &&
1087
+ serverState.serverLock.clientID !== clientID && (
1088
+ <div>
1089
+ <Alert severity="warning">
1090
+ {`The server is currently locked by "${serverState.serverLock.name}". Priority will be given to that client when they are streaming, and your streaming session may be halted abruptly.`}
1091
+ </Alert>
1092
+ </div>
1093
+ )}
1094
+ </>
1095
+ )}
1096
+ </Stack>
1097
+
1098
+ {isListener && !isSpeaker && (
1099
+ <Box sx={{marginBottom: 1, marginTop: 2}}>
1100
+ {xrDialogComponent}
1101
+ </Box>
1102
+ )}
1103
+ </div>
1104
+
1105
+ {debugParam && roomID != null && <DebugSection />}
1106
+
1107
+ <div className="translation-text-container-sra horizontal-padding-sra">
1108
+ <Stack
1109
+ direction="row"
1110
+ spacing={2}
1111
+ sx={{mb: '16px', alignItems: 'center'}}>
1112
+ <Typography variant="h1" sx={{fontWeight: 700, flexGrow: 1}}>
1113
+ Transcript
1114
+ </Typography>
1115
+ {isSpeaker && (
1116
+ <Button
1117
+ variant="text"
1118
+ size="small"
1119
+ onClick={onClearTranscriptForAll}>
1120
+ Clear Transcript for All
1121
+ </Button>
1122
+ )}
1123
+ </Stack>
1124
+ <Stack direction="row">
1125
+ <div className="translation-text-sra">
1126
+ {translationSentencesWithEmptyStartingString.map(
1127
+ (sentence, index, arr) => {
1128
+ const isLast = index === arr.length - 1;
1129
+ const maybeRef = isLast
1130
+ ? {ref: lastTranslationResultRef}
1131
+ : {};
1132
+ return (
1133
+ <div className="text-chunk-sra" key={index} {...maybeRef}>
1134
+ <Typography variant="body1">
1135
+ {sentence}
1136
+ {animateTextDisplay && isLast && (
1137
+ <Blink
1138
+ intervalMs={CURSOR_BLINK_INTERVAL_MS}
1139
+ shouldBlink={
1140
+ (roomState?.activeTranscoders ?? 0) > 0
1141
+ }>
1142
+ <Typography
1143
+ component="span"
1144
+ variant="body1"
1145
+ sx={{
1146
+ display: 'inline-block',
1147
+ transform: 'scaleY(1.25) translateY(-1px)',
1148
+ }}>
1149
+ {'|'}
1150
+ </Typography>
1151
+ </Blink>
1152
+ )}
1153
+ </Typography>
1154
+ </div>
1155
+ );
1156
+ },
1157
+ )}
1158
+ </div>
1159
+ </Stack>
1160
+ </div>
1161
+ </div>
1162
+ </Box>
1163
+ </div>
1164
+ );
1165
+ }
streaming-react-app/src/URLParams.ts ADDED
@@ -0,0 +1,50 @@
1
+ import { getBooleanParamFlag, getStringParamFlag } from './getParamFlag';
2
+ import { URLParamsObject } from './types/URLParamsTypes';
3
+
4
+ /**
5
+ * These are the URL parameters you can provide to the app to change its behavior.
6
+ *
7
+ * Boolean flags can be set by just providing the flag name (`?autoJoin`), or by
8
+ * explicitly setting it to 1 (true) or 0 (false): `?autoJoin=1` or `?autoJoin=0`
9
+ *
10
+ * String flags require an explicit value: `?roomID=ABCD`
11
+ *
12
+ * Examples:
13
+ *
14
+ * - `http://localhost:5173/?roomID=BBCD&autoJoin&debug`
15
+ * - `http://localhost:5173/?serverURL=localhost:8000`
16
+ *
17
+ * @returns the URLParamsObject assembled from window.location.search
18
+ */
19
+
20
+ export function getURLParams(): URLParamsObject {
21
+ return {
22
+ // animate the translation text when it arrives, typing it out one letter at a time
23
+ animateTextDisplay: getBooleanParamFlag('animateTextDisplay', true), // default to true;
24
+
25
+ // automatically join the room when the app loads. requires roomID to be set via url param as well
26
+ autoJoin: getBooleanParamFlag('autoJoin', false),
27
+
28
+ // automatically check the server debug flag as true
29
+ debug: getBooleanParamFlag('debug', false),
30
+
31
+ // Enable UI on the client that allows locking out other users of the server when it's being used for high profile demos
32
+ // NOTE: There is an escape hatch for disabling a server lock by setting the name field to remove_server_lock
33
+ enableServerLock: getBooleanParamFlag('enableServerLock', false),
34
+
35
+ // Pre-populate the Room Code field with the provided roomID. Can be used in conjunction with autoJoin to jump straight into the room
36
+ roomID: getStringParamFlag('roomID'),
37
+
38
+ // Use an alternate server URL as the streaming server (useful for pointing to dev servers: http://localhost:5173/?serverURL=localhost:8000)
39
+ serverURL: getStringParamFlag('serverURL'),
40
+
41
+ // Skip the popup dialog that displays within VR, which is mostly redundant with the web based dialog
42
+ skipARIntro: getBooleanParamFlag('skipARIntro', true), // default to true
43
+
44
+ // Shows the translation text in AR in front of an opaque panel covering all the text area
45
+ // single_block = original single text block with background
46
+ // lines = each line is a separate block and animates
47
+ // lines_with_background = adds a panel behind lines
48
+ ARTranscriptionType: getStringParamFlag('ARTranscriptionType') || 'lines',
49
+ };
50
+ }
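A quick illustration of the flag semantics documented above: a bare boolean flag counts as true, `=0` is the only falsy spelling, and string flags fall back to null. This is a usage sketch; the query string shown is hypothetical:

    // Assuming the app was loaded with ?roomID=BBCD&autoJoin&animateTextDisplay=0
    import {getURLParams} from './URLParams';

    const params = getURLParams();
    params.roomID;             // 'BBCD' (string flag with an explicit value)
    params.autoJoin;           // true (a bare boolean flag is treated as true)
    params.animateTextDisplay; // false ('=0' overrides the default of true)
    params.serverURL;          // null (absent string flag with no default)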
streaming-react-app/src/assets/Roboto-msdf.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-react-app/src/assets/Roboto-msdf.png ADDED
streaming-react-app/src/assets/RobotoMono-Regular-msdf.json ADDED
The diff for this file is too large to render. See raw diff
 
streaming-react-app/src/assets/RobotoMono-Regular.png ADDED
streaming-react-app/src/assets/seamless.svg ADDED
streaming-react-app/src/createBufferedSpeechPlayer.ts ADDED
@@ -0,0 +1,173 @@
1
+ import debug from './debug';
2
+
3
+ type AddAudioToBufferFunction = (
4
+ samples: Array<number>,
5
+ sampleRate: number,
6
+ ) => void;
7
+
8
+ export type BufferedSpeechPlayer = {
9
+ addAudioToBuffer: AddAudioToBufferFunction;
10
+ setGain: (gain: number) => void;
11
+ start: () => void;
12
+ stop: () => void;
13
+ };
14
+
15
+ type Options = {
16
+ onEnded?: () => void;
17
+ onStarted?: () => void;
18
+ };
19
+
20
+ export default function createBufferedSpeechPlayer({
21
+ onStarted,
22
+ onEnded,
23
+ }: Options): BufferedSpeechPlayer {
24
+ const audioContext = new AudioContext();
25
+ const gainNode = audioContext.createGain();
26
+ gainNode.connect(audioContext.destination);
27
+
28
+ let unplayedAudioBuffers: Array<AudioBuffer> = [];
29
+
30
+ let currentPlayingBufferSource: AudioBufferSourceNode | null = null;
31
+
32
+ let isPlaying = false;
33
+
34
+ // This means that the player starts in the 'stopped' state, and you need to call player.start() for it to start playing
35
+ let shouldPlayWhenAudioAvailable = false;
36
+
37
+ const setGain = (gain: number) => {
38
+ gainNode.gain.setValueAtTime(gain, audioContext.currentTime);
39
+ };
40
+
41
+ const start = () => {
42
+ shouldPlayWhenAudioAvailable = true;
43
+ debug()?.start();
44
+ playNextBufferIfNotAlreadyPlaying();
45
+ };
46
+
47
+ // Stop will stop the audio and clear the buffers
48
+ const stop = () => {
49
+ shouldPlayWhenAudioAvailable = false;
50
+
51
+ // Stop the current buffers
52
+ currentPlayingBufferSource?.stop();
53
+ currentPlayingBufferSource = null;
54
+
55
+ unplayedAudioBuffers = [];
56
+
57
+ onEnded != null && onEnded();
58
+ isPlaying = false;
59
+ return;
60
+ };
61
+
62
+ const playNextBufferIfNotAlreadyPlaying = () => {
63
+ if (!isPlaying) {
64
+ playNextBuffer();
65
+ }
66
+ };
67
+
68
+ const playNextBuffer = () => {
69
+ if (shouldPlayWhenAudioAvailable === false) {
70
+ console.debug(
71
+ '[BufferedSpeechPlayer][playNextBuffer] Not playing any more audio because shouldPlayWhenAudioAvailable is false.',
72
+ );
73
+ // NOTE: we do not need to set isPlaying = false or call onEnded because that will be handled in the stop() function
74
+ return;
75
+ }
76
+ if (unplayedAudioBuffers.length === 0) {
77
+ console.debug(
78
+ '[BufferedSpeechPlayer][playNextBuffer] No buffers to play.',
79
+ );
80
+ if (isPlaying) {
81
+ isPlaying = false;
82
+ onEnded != null && onEnded();
83
+ }
84
+ return;
85
+ }
86
+
87
+ // If isPlaying is false, then we are starting playback fresh rather than continuing it, and should call onStarted
88
+ if (isPlaying === false) {
89
+ isPlaying = true;
90
+ onStarted != null && onStarted();
91
+ }
92
+
93
+ const source = audioContext.createBufferSource();
94
+
95
+ // Get the first unplayed buffer from the array, and remove it from the array
96
+ const buffer = unplayedAudioBuffers.shift() ?? null;
97
+ source.buffer = buffer;
98
+ console.debug(
99
+ `[BufferedSpeechPlayer] Playing buffer with ${source.buffer?.length} samples`,
100
+ );
101
+
102
+ source.connect(gainNode);
103
+
104
+ const startTime = new Date().getTime();
105
+ source.start();
106
+ currentPlayingBufferSource = source;
107
+ // This is probably not necessary, but it doesn't hurt
108
+ isPlaying = true;
109
+
110
+ // TODO: consider changing this to a while loop to avoid deep recursion
111
+ const onThisBufferPlaybackEnded = () => {
112
+ console.debug(
113
+ `[BufferedSpeechPlayer] Buffer with ${source.buffer?.length} samples ended.`,
114
+ );
115
+ source.removeEventListener('ended', onThisBufferPlaybackEnded);
116
+ const endTime = new Date().getTime();
117
+ debug()?.playedAudio(startTime, endTime, buffer);
118
+ currentPlayingBufferSource = null;
119
+
120
+ // We don't set isPlaying = false here because we are attempting to continue playing. It will get set to false if there are no more buffers to play
121
+ playNextBuffer();
122
+ };
123
+
124
+ source.addEventListener('ended', onThisBufferPlaybackEnded);
125
+ };
126
+
127
+ const addAudioToBuffer: AddAudioToBufferFunction = (samples, sampleRate) => {
128
+ const incomingArrayBufferChunk = audioContext.createBuffer(
129
+ // 1 channel
130
+ 1,
131
+ samples.length,
132
+ sampleRate,
133
+ );
134
+
135
+ incomingArrayBufferChunk.copyToChannel(
136
+ new Float32Array(samples),
137
+ // first channel
138
+ 0,
139
+ );
140
+
141
+ console.debug(
142
+ `[addAudioToBufferAndPlay] Adding buffer with ${incomingArrayBufferChunk.length} samples to queue.`,
143
+ );
144
+
145
+ unplayedAudioBuffers.push(incomingArrayBufferChunk);
146
+ debug()?.receivedAudio(
147
+ incomingArrayBufferChunk.length / incomingArrayBufferChunk.sampleRate,
148
+ );
149
+ const audioBuffersTableInfo = unplayedAudioBuffers.map((buffer, i) => {
150
+ return {
151
+ index: i,
152
+ duration: buffer.length / buffer.sampleRate,
153
+ samples: buffer.length,
154
+ };
155
+ });
156
+ const totalUnplayedDuration = unplayedAudioBuffers.reduce((acc, buffer) => {
157
+ return acc + buffer.length / buffer.sampleRate;
158
+ }, 0);
159
+
160
+ console.debug(
161
+ `[addAudioToBufferAndPlay] Current state of incoming audio buffers (${totalUnplayedDuration.toFixed(
162
+ 1,
163
+ )}s unplayed):`,
164
+ );
165
+ console.table(audioBuffersTableInfo);
166
+
167
+ if (shouldPlayWhenAudioAvailable) {
168
+ playNextBufferIfNotAlreadyPlaying();
169
+ }
170
+ };
171
+
172
+ return {addAudioToBuffer, setGain, stop, start};
173
+ }
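A minimal usage sketch of the player API above (the callbacks, gain value, and sample data are placeholders): the player begins in the stopped state, so start() must be called before queued buffers will play.

    import createBufferedSpeechPlayer from './createBufferedSpeechPlayer';

    const player = createBufferedSpeechPlayer({
      onStarted: () => console.debug('playback started'),
      onEnded: () => console.debug('playback queue drained'),
    });

    player.start();      // opt in to playback; the player starts out stopped
    player.setGain(1.5); // scale output volume via the internal GainNode
    player.addAudioToBuffer([0, 0.05, -0.05], 48000); // placeholder samples at 48 kHz
    player.stop();       // halt playback and discard any queued buffers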
streaming-react-app/src/cursorBlinkInterval.ts ADDED
@@ -0,0 +1 @@
1
+ export const CURSOR_BLINK_INTERVAL_MS = 500;
streaming-react-app/src/debug.ts ADDED
@@ -0,0 +1,257 @@
1
+ import {TYPING_ANIMATION_DELAY_MS} from './StreamingInterface';
2
+ import {getURLParams} from './URLParams';
3
+ import audioBuffertoWav from 'audiobuffer-to-wav';
4
+ import './StreamingInterface.css';
5
+
6
+ type StartEndTime = {
7
+ start: number;
8
+ end: number;
9
+ };
10
+
11
+ type StartEndTimeWithAudio = StartEndTime & {
12
+ float32Audio: Float32Array;
13
+ };
14
+
15
+ type Text = {
16
+ time: number;
17
+ chars: number;
18
+ };
19
+
20
+ type DebugTimings = {
21
+ receivedAudio: StartEndTime[];
22
+ playedAudio: StartEndTimeWithAudio[];
23
+ receivedText: Text[];
24
+ renderedText: StartEndTime[];
25
+ sentAudio: StartEndTimeWithAudio[];
26
+ startRenderTextTime: number | null;
27
+ startRecordingTime: number | null;
28
+ receivedAudioSampleRate: number | null;
29
+ };
30
+
31
+ function getInitialTimings(): DebugTimings {
32
+ return {
33
+ receivedAudio: [],
34
+ playedAudio: [],
35
+ receivedText: [],
36
+ renderedText: [],
37
+ sentAudio: [],
38
+ startRenderTextTime: null,
39
+ startRecordingTime: null,
40
+ receivedAudioSampleRate: null,
41
+ };
42
+ }
43
+
44
+ function downloadAudioBuffer(audioBuffer: AudioBuffer, fileName: string): void {
45
+ const wav = audioBuffertoWav(audioBuffer);
46
+ const wavBlob = new Blob([new DataView(wav)], {
47
+ type: 'audio/wav',
48
+ });
49
+ const url = URL.createObjectURL(wavBlob);
50
+ const anchor = document.createElement('a');
51
+ anchor.href = url;
52
+ anchor.target = '_blank';
53
+ anchor.download = fileName;
54
+ anchor.click();
55
+ }
56
+
57
+ // Uncomment for debugging without download
58
+ // function playAudioBuffer(audioBuffer: AudioBuffer): void {
59
+ // const audioContext = new AudioContext();
60
+ // const source = audioContext.createBufferSource();
61
+
62
+ // source.buffer = audioBuffer;
63
+ // source.connect(audioContext.destination);
64
+ // source.start();
65
+ // }
66
+
67
+ // Accumulate timings and audio / text translation samples for debugging and exporting
68
+ class DebugTimingsManager {
69
+ timings: DebugTimings = getInitialTimings();
70
+
71
+ start(): void {
72
+ this.timings = getInitialTimings();
73
+ this.timings.startRecordingTime = new Date().getTime();
74
+ }
75
+
76
+ sentAudio(event: AudioProcessingEvent): void {
77
+ const end = new Date().getTime();
78
+ const start = end - event.inputBuffer.duration * 1000;
79
+ // Copy or else buffer seems to be re-used
80
+ const float32Audio = new Float32Array(event.inputBuffer.getChannelData(0));
81
+ this.timings.sentAudio.push({
82
+ start,
83
+ end,
84
+ float32Audio,
85
+ });
86
+ }
87
+
88
+ receivedText(text: string): void {
89
+ this.timings.receivedText.push({
90
+ time: new Date().getTime(),
91
+ chars: text.length,
92
+ });
93
+ }
94
+
95
+ startRenderText(): void {
96
+ if (this.timings.startRenderTextTime == null) {
97
+ this.timings.startRenderTextTime = new Date().getTime();
98
+ }
99
+ }
100
+
101
+ endRenderText(): void {
102
+ if (this.timings.startRenderTextTime == null) {
103
+ console.warn(
104
+ 'endRenderText called without a matching startRenderText: startRenderTextTime is null',
105
+ );
106
+ return;
107
+ }
108
+
109
+ this.timings.renderedText.push({
110
+ start: this.timings.startRenderTextTime as number,
111
+ end: new Date().getTime(),
112
+ });
113
+ this.timings.startRenderTextTime = null;
114
+ }
115
+
116
+ receivedAudio(duration: number): void {
117
+ const start = new Date().getTime();
118
+ this.timings.receivedAudio.push({
119
+ start,
120
+ end: start + duration * 1000,
121
+ });
122
+ }
123
+
124
+ playedAudio(start: number, end: number, buffer: AudioBuffer | null): void {
125
+ if (buffer != null) {
126
+ if (this.timings.receivedAudioSampleRate == null) {
127
+ this.timings.receivedAudioSampleRate = buffer.sampleRate;
128
+ }
129
+ if (this.timings.receivedAudioSampleRate !== buffer.sampleRate) {
130
+ console.error(
131
+ 'Sample rates of received audio are unequal, will fail to reconstruct debug audio',
132
+ this.timings.receivedAudioSampleRate,
133
+ buffer.sampleRate,
134
+ );
135
+ }
136
+ }
137
+ this.timings.playedAudio.push({
138
+ start,
139
+ end,
140
+ float32Audio:
141
+ buffer == null
142
+ ? new Float32Array()
143
+ : new Float32Array(buffer.getChannelData(0)),
144
+ });
145
+ }
146
+
147
+ getChartData() {
148
+ const columns = [
149
+ {type: 'string', id: 'Series'},
150
+ {type: 'date', id: 'Start'},
151
+ {type: 'date', id: 'End'},
152
+ ];
153
+ return [
154
+ columns,
155
+ ...this.timings.sentAudio.map((sentAudio) => [
156
+ 'Sent Audio',
157
+ new Date(sentAudio.start),
158
+ new Date(sentAudio.end),
159
+ ]),
160
+ ...this.timings.receivedAudio.map((receivedAudio) => [
161
+ 'Received Audio',
162
+ new Date(receivedAudio.start),
163
+ new Date(receivedAudio.end),
164
+ ]),
165
+ ...this.timings.playedAudio.map((playedAudio) => [
166
+ 'Played Audio',
167
+ new Date(playedAudio.start),
168
+ new Date(playedAudio.end),
169
+ ]),
170
+ // Best estimate duration by multiplying length with animation duration for each letter
171
+ ...this.timings.receivedText.map((receivedText) => [
172
+ 'Received Text',
173
+ new Date(receivedText.time),
174
+ new Date(
175
+ receivedText.time + receivedText.chars * TYPING_ANIMATION_DELAY_MS,
176
+ ),
177
+ ]),
178
+ ...this.timings.renderedText.map((renderedText) => [
179
+ 'Rendered Text',
180
+ new Date(renderedText.start),
181
+ new Date(renderedText.end),
182
+ ]),
183
+ ];
184
+ }
185
+
186
+ downloadInputAudio() {
187
+ const audioContext = new AudioContext();
188
+ const totalLength = this.timings.sentAudio.reduce((acc, cur) => {
189
+ return acc + (cur?.float32Audio?.length ?? 0);
190
+ }, 0);
191
+ if (totalLength === 0) {
192
+ return;
193
+ }
194
+
195
+ const incomingArrayBuffer = audioContext.createBuffer(
196
+ 1, // 1 channel
197
+ totalLength,
198
+ audioContext.sampleRate,
199
+ );
200
+
201
+ const buffer = incomingArrayBuffer.getChannelData(0);
202
+ let i = 0;
203
+ this.timings.sentAudio.forEach((sentAudio) => {
204
+ sentAudio.float32Audio.forEach((bytes) => {
205
+ buffer[i++] = bytes;
206
+ });
207
+ });
208
+
209
+ // Play for debugging
210
+ // playAudioBuffer(incomingArrayBuffer);
211
+ downloadAudioBuffer(incomingArrayBuffer, `input_audio.wav`);
212
+ }
213
+
214
+ downloadOutputAudio() {
215
+ const playedAudio = this.timings.playedAudio;
216
+ const sampleRate = this.timings.receivedAudioSampleRate;
217
+ if (
218
+ playedAudio.length === 0 ||
219
+ this.timings.startRecordingTime == null ||
220
+ sampleRate == null
221
+ ) {
222
+ return null;
223
+ }
224
+
225
+ let previousEndTime = this.timings.startRecordingTime;
226
+ const audioArray: number[] = [];
227
+ playedAudio.forEach((audio) => {
228
+ const delta = (audio.start - previousEndTime) / 1000;
229
+ for (let i = 0; i < delta * sampleRate; i++) {
230
+ audioArray.push(0.0);
231
+ }
232
+ audio.float32Audio.forEach((bytes) => audioArray.push(bytes));
233
+ previousEndTime = audio.end;
234
+ });
235
+ const audioContext = new AudioContext();
236
+ const incomingArrayBuffer = audioContext.createBuffer(
237
+ 1, // 1 channel
238
+ audioArray.length,
239
+ sampleRate,
240
+ );
241
+
242
+ incomingArrayBuffer.copyToChannel(
243
+ new Float32Array(audioArray),
244
+ 0, // first channel
245
+ );
246
+
247
+ // Play for debugging
248
+ // playAudioBuffer(incomingArrayBuffer);
249
+ downloadAudioBuffer(incomingArrayBuffer, 'output_audio.wav');
250
+ }
251
+ }
252
+
253
+ const debugSingleton = new DebugTimingsManager();
254
+ export default function debug(): DebugTimingsManager | null {
255
+ const debugParam = getURLParams().debug;
256
+ return debugParam ? debugSingleton : null;
257
+ }
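Because debug() returns null unless the ?debug URL flag is set, every call site guards with optional chaining. A usage sketch (the payload string is a placeholder):

    import debug from './debug';

    debug()?.start();               // reset timings when a recording session begins
    debug()?.receivedText('hola');  // record an incoming text payload
    debug()?.downloadInputAudio();  // export all captured input audio as a WAV file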
streaming-react-app/src/float32To16BitPCM.ts ADDED
@@ -0,0 +1,16 @@
1
+ export default function float32To16BitPCM(
2
+ float32Arr: Float32Array,
3
+ ): Int16Array {
4
+ const pcm16bit = new Int16Array(float32Arr.length);
5
+ for (let i = 0; i < float32Arr.length; ++i) {
6
+ // force number in [-1,1]
7
+ const s = Math.max(-1, Math.min(1, float32Arr[i]));
8
+
9
+ /**
10
+ * convert 32 bit float to 16 bit int pcm audio
11
+ * negatives scale by 0x8000 (int16 min is -0x8000 = -32768), positives by 0x7fff (int16 max = 32767)
12
+ */
13
+ pcm16bit[i] = s < 0 ? s * 0x8000 : s * 0x7fff;
14
+ }
15
+ return pcm16bit;
16
+ }
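A worked example of the conversion: the extremes of the float range map onto the full int16 range, and out-of-range inputs are clamped first.

    import float32To16BitPCM from './float32To16BitPCM';

    const pcm = float32To16BitPCM(new Float32Array([1.0, -1.0, 0.5, 2.0]));
    // pcm[0] === 32767  (1.0 * 0x7fff)
    // pcm[1] === -32768 (-1.0 * 0x8000)
    // pcm[2] === 16383  (0.5 * 0x7fff = 16383.5, truncated on Int16Array assignment)
    // pcm[3] === 32767  (2.0 is clamped to 1.0 before scaling)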
streaming-react-app/src/generateNewRoomID.ts ADDED
@@ -0,0 +1,56 @@
1
+ import {random} from 'lodash';
2
+
3
+ // const USABLE_CHARACTERS = 'BCDFGHJKMPQRTVWXY2346789';
4
+ const USABLE_CHARACTERS = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';
5
+ const ID_LENGTH = 4;
6
+
7
+ export function isValidRoomID(id: string | null | undefined): boolean {
8
+ if (id == null) {
9
+ return false;
10
+ }
11
+ if (id.length !== ID_LENGTH) {
12
+ return false;
13
+ }
14
+ return isValidPartialRoomID(id);
15
+ }
16
+
17
+ export function isValidPartialRoomID(roomID: string): boolean {
18
+ return (
19
+ roomID.length <= ID_LENGTH &&
20
+ roomID.split('').every((char) => USABLE_CHARACTERS.includes(char))
21
+ );
22
+ }
23
+
24
+ export default function generateNewRoomID(): string {
25
+ return Array.from(
26
+ {length: ID_LENGTH},
27
+ () => USABLE_CHARACTERS[random(USABLE_CHARACTERS.length - 1)],
28
+ ).join('');
29
+ }
30
+
31
+ export function getSequentialRoomIDForTestingGenerator(): () => string {
32
+ let counter = 0;
33
+
34
+ return function generateNextRoomID(): string {
35
+ const counterInBase: string = Number(counter)
36
+ .toString(USABLE_CHARACTERS.length)
37
+ .padStart(ID_LENGTH, '0');
38
+
39
+ if (counterInBase.length > ID_LENGTH) {
40
+ throw new Error(
41
+ 'Ran out of unique room IDs from the sequential generator',
42
+ );
43
+ }
44
+
45
+ const result = counterInBase
46
+ .split('')
47
+ .map(
48
+ (digit) => USABLE_CHARACTERS[parseInt(digit, USABLE_CHARACTERS.length)],
49
+ )
50
+ .join('');
51
+
52
+ counter++;
53
+
54
+ return result;
55
+ };
56
+ }
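A usage sketch of the room-ID helpers (the random ID shown is just an example value):

    import generateNewRoomID, {
      isValidRoomID,
      getSequentialRoomIDForTestingGenerator,
    } from './generateNewRoomID';

    generateNewRoomID();   // e.g. 'QXZP': four characters drawn from A-Z
    isValidRoomID('QXZP'); // true
    isValidRoomID('AB1');  // false: wrong length, and '1' is not in the alphabet

    const next = getSequentialRoomIDForTestingGenerator();
    next(); // 'AAAA'
    next(); // 'AAAB' (deterministic sequence for tests)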
streaming-react-app/src/getParamFlag.ts ADDED
@@ -0,0 +1,39 @@
1
+ import type {URLParamNames} from './types/URLParamsTypes';
2
+
3
+ export function getBooleanParamFlag(
4
+ flag: URLParamNames,
5
+ defaultValue?: boolean,
6
+ ): boolean {
7
+ const paramFlagValue = getBooleanParamFlagWithoutDefault(flag);
8
+
9
+ if (paramFlagValue == null) {
10
+ // The default value for param flags is false, unless the caller explicitly
11
+ // provides a defaultValue argument
12
+ return defaultValue ?? false;
13
+ }
14
+
15
+ return paramFlagValue;
16
+ }
17
+
18
+ export function getBooleanParamFlagWithoutDefault(
19
+ flag: URLParamNames,
20
+ ): boolean | null {
21
+ const urlParams = new URLSearchParams(window.location.search);
22
+
23
+ if (urlParams.get(flag) == null) {
24
+ return null;
25
+ }
26
+
27
+ return urlParams.get(flag) !== '0';
28
+ }
29
+
30
+ export function getStringParamFlag(
31
+ flag: URLParamNames,
32
+ defaultValue?: string,
33
+ ): string | null {
34
+ const urlParams = new URLSearchParams(window.location.search);
35
+
36
+ const param = urlParams.get(flag);
37
+
38
+ return param ?? defaultValue ?? null;
39
+ }
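The boolean parsing is tri-state: present with '0' is false, present with any other value (including no value at all) is true, and absent is null so a default can apply. A sketch, assuming window.location.search is '?autoJoin=0&debug':

    import {getBooleanParamFlag, getBooleanParamFlagWithoutDefault} from './getParamFlag';

    getBooleanParamFlagWithoutDefault('autoJoin');    // false ('0' is the only falsy spelling)
    getBooleanParamFlagWithoutDefault('debug');       // true (present with no value)
    getBooleanParamFlagWithoutDefault('skipARIntro'); // null (flag absent)
    getBooleanParamFlag('skipARIntro', true);         // true (falls back to defaultValue)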
streaming-react-app/src/getTranslationSentencesFromReceivedData.ts ADDED
@@ -0,0 +1,22 @@
1
+ import {ServerTextData, TranslationSentences} from './types/StreamingTypes';
2
+
3
+ export default function getTranslationSentencesFromReceivedData(
4
+ receivedData: Array<ServerTextData>,
5
+ ): TranslationSentences {
6
+ return receivedData
7
+ .reduce(
8
+ (acc, data) => {
9
+ const newAcc = [
10
+ ...acc.slice(0, -1),
11
+ acc[acc.length - 1].trim() + ' ' + data.payload,
12
+ ];
13
+ if (data.eos) {
14
+ newAcc.push('');
15
+ }
16
+
17
+ return newAcc;
18
+ },
19
+ [''],
20
+ )
21
+ .filter((s) => s.trim().length !== 0);
22
+ }
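A worked example: payloads are joined with spaces into the current sentence, and an eos marker closes it and opens the next. Note that sentences after the first keep a leading space, since the filter only drops fully empty strings. The data below is illustrative, assuming ServerTextData carries at least payload and eos:

    import getTranslationSentencesFromReceivedData from './getTranslationSentencesFromReceivedData';
    import {ServerTextData} from './types/StreamingTypes';

    const received = [
      {payload: 'Hello', eos: false},
      {payload: 'world.', eos: true},
      {payload: 'Bye.', eos: true},
    ] as Array<ServerTextData>;

    getTranslationSentencesFromReceivedData(received);
    // => ['Hello world.', ' Bye.']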
streaming-react-app/src/isScrolledToDocumentBottom.ts ADDED
@@ -0,0 +1,11 @@
1
+ export default function isScrolledToDocumentBottom(
2
+ bufferPx: number = 0,
3
+ ): boolean {
4
+ if (
5
+ window.innerHeight + window.scrollY >=
6
+ document.body.offsetHeight - bufferPx
7
+ ) {
8
+ return true;
9
+ }
10
+ return false;
11
+ }
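A usage sketch: treat the page as "at the bottom" when within some pixel threshold of it (the 100px value here is illustrative):

    import isScrolledToDocumentBottom from './isScrolledToDocumentBottom';

    if (isScrolledToDocumentBottom(100)) {
      // keep auto-scrolling as new transcript text arrives
    }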
streaming-react-app/src/languageLookup.ts ADDED
@@ -0,0 +1,117 @@
1
+ const LANG3_FULL: Record<string, string> = {
2
+ eng: 'english',
3
+ arb: 'arabic',
4
+ asm: 'assamese',
5
+ bel: 'belarusian',
6
+ bul: 'bulgarian',
7
+ ben: 'bengali',
8
+ cat: 'catalan',
9
+ ces: 'czech',
10
+ cym: 'welsh',
11
+ dan: 'danish',
12
+ deu: 'german',
13
+ ell: 'greek',
14
+ spa: 'spanish',
15
+ est: 'estonian',
16
+ pes: 'persian',
17
+ fin: 'finnish',
18
+ fra: 'french',
19
+ hin: 'hindi',
20
+ hun: 'hungarian',
21
+ ind: 'indonesian',
22
+ ita: 'italian',
23
+ jpn: 'japanese',
24
+ kat: 'georgian',
25
+ lit: 'lithuanian',
26
+ lvs: 'latvian',
27
+ khk: 'mongolian',
28
+ mar: 'marathi',
29
+ mlt: 'maltese',
30
+ nld: 'dutch',
31
+ pan: 'punjabi',
32
+ pol: 'polish',
33
+ por: 'portuguese',
34
+ ron: 'romanian',
35
+ rus: 'russian',
36
+ slk: 'slovak',
37
+ slv: 'slovenian',
38
+ swe: 'swedish',
39
+ swh: 'swahili',
40
+ tam: 'tamil',
41
+ tha: 'thai',
42
+ tur: 'turkish',
43
+ ukr: 'ukrainian',
44
+ urd: 'urdu',
45
+ uzn: 'uzbek',
46
+ vie: 'vietnamese',
47
+ cmn: 'chinese',
48
+ afr: 'afrikaans',
49
+ isl: 'icelandic',
50
+ ltz: 'luxembourgish',
51
+ nob: 'norwegian',
52
+ glg: 'galician',
53
+ bos: 'bosnian',
54
+ hrv: 'croatian',
55
+ mkd: 'macedonian',
56
+ srp: 'serbian',
57
+ hye: 'armenian',
58
+ azj: 'azerbaijani',
59
+ kaz: 'kazakh',
60
+ kor: 'korean',
61
+ guj: 'gujarati',
62
+ kan: 'kannada',
63
+ npi: 'nepali',
64
+ snd: 'sindhi',
65
+ tel: 'telugu',
66
+ jav: 'javanese',
67
+ zlm: 'malay',
68
+ mal: 'malayalam',
69
+ tgl: 'tagalog',
70
+ mya: 'myanmar',
71
+ khm: 'khmer',
72
+ lao: 'lao',
73
+ heb: 'hebrew',
74
+ pbt: 'pashto',
75
+ tgk: 'tajik',
76
+ amh: 'amharic',
77
+ lin: 'lingala',
78
+ som: 'somali',
79
+ yor: 'yoruba',
80
+ sna: 'shona',
81
+ mri: 'maori',
82
+ hau: 'hausa',
83
+ oci: 'occitan',
84
+ bak: 'bashkir',
85
+ bre: 'breton',
86
+ yid: 'yiddish',
87
+ hat: 'haitian creole',
88
+ mlg: 'malagasy',
89
+ sin: 'sinhala',
90
+ sqi: 'albanian',
91
+ sun: 'sundanese',
92
+ eus: 'basque',
93
+ nno: 'nynorsk',
94
+ tat: 'tatar',
95
+ bod: 'tibetan',
96
+ fao: 'faroese',
97
+ haw: 'hawaiian',
98
+ lat: 'latin',
99
+ san: 'sanskrit',
100
+ tuk: 'turkmen'
101
+ };
102
+
103
+ export function getLanguageFromThreeLetterCode(code: string): string | null {
104
+ try {
105
+ const name = LANG3_FULL[code] ?? null;
106
+ if (name == null) {
107
+ return null;
108
+ }
109
+ const capitalizedWords = name
110
+ .split(' ')
111
+ .map((word: string) => word[0].toUpperCase() + word.slice(1));
112
+ return capitalizedWords.join(' ');
113
+ } catch (e) {
114
+ console.warn(`Unable to get language name for code ${code}: ${e}`);
115
+ }
116
+ return null;
117
+ }
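A worked example of the lookup and per-word capitalization:

    import {getLanguageFromThreeLetterCode} from './languageLookup';

    getLanguageFromThreeLetterCode('fra'); // 'French'
    getLanguageFromThreeLetterCode('hat'); // 'Haitian Creole' (each word capitalized)
    getLanguageFromThreeLetterCode('xyz'); // null (unknown codes return null)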
streaming-react-app/src/main.tsx ADDED
@@ -0,0 +1,9 @@
1
+ import React from 'react';
2
+ import ReactDOM from 'react-dom/client';
3
+ import App from './App.tsx';
4
+
5
+ ReactDOM.createRoot(document.getElementById('root')!).render(
6
+ <React.StrictMode>
7
+ <App />
8
+ </React.StrictMode>,
9
+ );
streaming-react-app/src/react-xr/ARButton.tsx ADDED
@@ -0,0 +1,89 @@
1
+ import * as THREE from 'three';
2
+ import {Button} from '@mui/material';
3
+ import {useCallback, useEffect, useState} from 'react';
4
+ import {BufferedSpeechPlayer} from '../createBufferedSpeechPlayer';
5
+
6
+ type Props = {
7
+ bufferedSpeechPlayer: BufferedSpeechPlayer;
8
+ renderer: THREE.WebGLRenderer | null;
9
+ onARVisible?: () => void;
10
+ onARHidden?: () => void;
11
+ };
12
+
13
+ export default function ARButton({
14
+ bufferedSpeechPlayer,
15
+ renderer,
16
+ onARVisible,
17
+ onARHidden,
18
+ }: Props) {
19
+ const [session, setSession] = useState<XRSession | null>(null);
20
+ const [supported, setSupported] = useState<boolean>(true);
21
+
22
+ useEffect(() => {
23
+ if (!navigator.xr) {
24
+ setSupported(false);
25
+ return;
26
+ }
27
+ navigator.xr.isSessionSupported('immersive-ar').then((supported) => {
28
+ setSupported(supported);
29
+ });
30
+ }, []);
31
+
32
+ const resetBuffers = useCallback(
33
+ (event: XRSessionEvent) => {
34
+ const session = event.target;
35
+ if (!(session instanceof XRSession)) {
36
+ return;
37
+ }
38
+ switch (session.visibilityState) {
39
+ case 'visible':
40
+ console.log('Restarting speech player, device is visible');
41
+ bufferedSpeechPlayer.stop();
42
+ bufferedSpeechPlayer.start();
43
+ onARVisible?.();
44
+ break;
45
+ case 'hidden':
46
+ console.log('Stopping speech player, device is hidden');
47
+ bufferedSpeechPlayer.stop();
48
+ bufferedSpeechPlayer.start();
49
+ onARHidden?.();
50
+ break;
51
+ }
52
+ },
53
+ [bufferedSpeechPlayer, onARVisible, onARHidden],
54
+ );
55
+
56
+ async function onSessionStarted(session: XRSession) {
57
+ setSession(session);
58
+
59
+ session.onvisibilitychange = resetBuffers;
60
+ session.onend = onSessionEnded;
61
+
62
+ await renderer?.xr.setSession(session);
63
+ }
64
+
65
+ function onSessionEnded() {
66
+ setSession(null);
67
+ }
68
+
69
+ const onClick = () => {
70
+ if (session === null) {
71
+ navigator.xr!.requestSession('immersive-ar').then(onSessionStarted);
72
+ } else {
73
+ session.end();
74
+ }
75
+ };
76
+ return (
77
+ <Button
78
+ variant="contained"
79
+ onClick={onClick}
80
+ disabled={!supported || renderer == null}
81
+ sx={{mt: 1}}>
82
+ {supported
83
+ ? renderer != null
84
+ ? 'Enter AR'
85
+ : 'Initializing AR...'
86
+ : 'AR Not Supported'}
87
+ </Button>
88
+ );
89
+ }
streaming-react-app/src/react-xr/Button.tsx ADDED
@@ -0,0 +1,117 @@
1
+ import {useRef, useEffect} from 'react';
2
+ import * as THREE from 'three';
3
+ import {extend} from '@react-three/fiber';
4
+ import ThreeMeshUI from 'three-mesh-ui';
5
+ import ThreeMeshUIText, {ThreeMeshUITextType} from './ThreeMeshUIText';
6
+ import {Interactive} from '@react-three/xr';
7
+
8
+ /**
9
+ * Using `?url` at the end of this import tells vite this is a static asset, and
10
+ * provides us a URL to the hashed version of the file when the project is built.
11
+ * See: https://vitejs.dev/guide/assets.html#explicit-url-imports
12
+ */
13
+ import robotoFontFamilyJson from '../assets/RobotoMono-Regular-msdf.json?url';
14
+ import robotoFontTexture from '../assets/RobotoMono-Regular.png';
15
+
16
+ extend(ThreeMeshUI);
17
+
18
+ /**
19
+ * Button component that renders as a three-mesh-ui block
20
+ */
21
+ export default function Button({
22
+ onClick,
23
+ content,
24
+ width,
25
+ height,
26
+ fontSize,
27
+ borderRadius,
28
+ padding,
29
+ }) {
30
+ const button = useRef<JSX.IntrinsicElements['block']>();
31
+ const textRef = useRef<ThreeMeshUITextType>();
32
+
33
+ useEffect(() => {
34
+ if (textRef.current != null) {
35
+ textRef.current.set({content});
36
+ }
37
+ }, [textRef, content]);
38
+
39
+ useEffect(() => {
40
+ if (!button.current) {
41
+ return;
42
+ }
43
+ button.current.setupState({
44
+ state: 'hovered',
45
+ attributes: {
46
+ offset: 0.002,
47
+ backgroundColor: new THREE.Color(0x607b8f),
48
+ fontColor: new THREE.Color(0xffffff),
49
+ },
50
+ });
51
+ button.current.setupState({
52
+ state: 'idle',
53
+ attributes: {
54
+ offset: 0.001,
55
+ backgroundColor: new THREE.Color(0x465a69),
56
+ fontColor: new THREE.Color(0xffffff),
57
+ },
58
+ });
59
+ button.current.setupState({
60
+ state: 'selected',
61
+ attributes: {
62
+ offset: 0.005,
63
+ backgroundColor: new THREE.Color(0x000000),
64
+ fontColor: new THREE.Color(0xffffff),
65
+ },
66
+ });
67
+ button.current.setState('idle');
68
+ }, []);
69
+
70
+ const args = [
71
+ {
72
+ width,
73
+ height,
74
+ fontSize,
75
+ padding,
76
+ justifyContent: 'end',
77
+ textAlign: 'center',
78
+ alignItems: 'center',
79
+ borderRadius,
80
+ fontFamily: robotoFontFamilyJson,
81
+ fontTexture: robotoFontTexture,
82
+ backgroundOpacity: 1,
83
+ backgroundColor: new THREE.Color(0x779092),
84
+ fontColor: new THREE.Color(0x000000),
85
+ },
86
+ ];
87
+
88
+ return (
89
+ <Interactive
90
+ // These are for XR mode
91
+ onSelect={() => {
92
+ onClick();
93
+ }}
94
+ onHover={() => button.current?.setState('hovered')}
95
+ onBlur={() => button.current?.setState('idle')}
96
+ onSelectStart={() => button.current?.setState('selected')}
97
+ onSelectEnd={() => button.current?.setState('idle')}>
98
+ <block
99
+ // These are for non-XR modes
100
+ onPointerEnter={() => button.current?.setState('hovered')}
101
+ onPointerLeave={() => button.current?.setState('idle')}
102
+ onPointerDown={() => button.current?.setState('selected')}
103
+ onPointerUp={() => {
104
+ button.current?.setState('hovered');
105
+ onClick();
106
+ }}>
107
+ <block args={args} ref={button}>
108
+ <ThreeMeshUIText
109
+ ref={textRef}
110
+ fontColor={new THREE.Color(0xffffff)}
111
+ content={content}
112
+ />
113
+ </block>
114
+ </block>
115
+ </Interactive>
116
+ );
117
+ }
streaming-react-app/src/react-xr/Colors.ts ADDED
@@ -0,0 +1,6 @@
1
+ import * as THREE from 'three';
2
+
3
+ export const WHITE = new THREE.Color('#FFFFFF');
4
+ export const BLACK = new THREE.Color('#000000');
5
+ export const RED = new THREE.Color('red');
6
+ export const BLUE = new THREE.Color('blue');
streaming-react-app/src/react-xr/MovementController.tsx ADDED
@@ -0,0 +1,64 @@
1
+ import {useRef} from 'react';
2
+ import {useFrame} from '@react-three/fiber';
3
+ import {useController, useXR} from '@react-three/xr';
4
+ import * as THREE from 'three';
5
+
6
+ const USE_HORIZONTAL = true;
7
+ const USE_VERTICAL = true;
8
+ const USE_ROTATION = true;
9
+ const HORIZONTAL_AXIS = 2;
10
+ const VERTICAL_AXIS = 3;
11
+ const ROTATION_AXIS = 2;
12
+ const SENSITIVITY = 0.05;
13
+ const DEADZONE = 0.05;
14
+
15
+ /**
16
+ * Component to add into the ThreeJS canvas that reads controller (Quest) inputs to change camera position
17
+ */
18
+ export default function MovementController() {
19
+ const xr = useXR();
20
+ const controller = useController('right');
21
+ const forward = useRef(new THREE.Vector3());
22
+ const horizontal = useRef(new THREE.Vector3());
23
+
24
+ useFrame(() => {
25
+ const player = xr.player;
26
+ const camera = xr.player.children[0];
27
+ const cameraMatrix = camera.matrixWorld.elements;
28
+ forward.current
29
+ .set(-cameraMatrix[8], -cameraMatrix[9], -cameraMatrix[10])
30
+ .normalize();
31
+
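+ // Axes 2 and 3 hold the thumbstick X/Y in the WebXR 'xr-standard' gamepad mapping;
+ // readings inside the DEADZONE band are zeroed below so stick drift doesn't move the player.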
32
+ const axes = controller?.inputSource?.gamepad?.axes ?? [0, 0, 0, 0];
33
+
34
+ if (USE_HORIZONTAL) {
35
+ horizontal.current.copy(forward.current);
36
+ horizontal.current.cross(camera.up).normalize();
37
+
38
+ player.position.add(
39
+ horizontal.current.multiplyScalar(
40
+ (Math.abs(axes[HORIZONTAL_AXIS]) > DEADZONE
41
+ ? axes[HORIZONTAL_AXIS]
42
+ : 0) * SENSITIVITY,
43
+ ),
44
+ );
45
+ }
46
+
47
+ if (USE_VERTICAL) {
48
+ player.position.add(
49
+ forward.current.multiplyScalar(
50
+ (Math.abs(axes[VERTICAL_AXIS]) > DEADZONE ? axes[VERTICAL_AXIS] : 0) *
51
+ SENSITIVITY,
52
+ ),
53
+ );
54
+ }
55
+
56
+ if (USE_ROTATION) {
57
+ player.rotation.y -=
58
+ (Math.abs(axes[ROTATION_AXIS]) > DEADZONE ? axes[ROTATION_AXIS] : 0) *
59
+ SENSITIVITY;
60
+ }
61
+ });
62
+
63
+ return <></>;
64
+ }
streaming-react-app/src/react-xr/Playground.tsx ADDED
@@ -0,0 +1,133 @@
1
+ /**
2
+ * EXPERIMENTAL components to play around with but not officially used in the demo while
3
+ * we develop.
4
+ */
5
+ import {useEffect, useState} from 'react';
6
+ import {Object3DNode, extend} from '@react-three/fiber';
7
+ import ThreeMeshUI from 'three-mesh-ui';
8
+
9
+ import {} from '@react-three/xr';
10
+ import {Sparkles, Shadow} from '@react-three/drei';
11
+
12
+ // import FontImage from './assets/Roboto-msdf.png';
13
+ import {FontLoader} from 'three/examples/jsm/loaders/FontLoader.js';
14
+ import {TextGeometry} from 'three/examples/jsm/geometries/TextGeometry.js';
15
+ import ThreeMeshUIText from './ThreeMeshUIText';
16
+ import {ContactShadows, BakeShadows} from '@react-three/drei';
17
+
18
+ extend({TextGeometry});
19
+ extend(ThreeMeshUI);
20
+
21
+ declare module '@react-three/fiber' {
22
+ interface ThreeElements {
23
+ textGeometry: Object3DNode<TextGeometry, typeof TextGeometry>;
24
+ }
25
+ }
26
+
27
+ // This is for textGeometry.. not using three-mesh-ui to display text
28
+ export function TitleMesh() {
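+ // NOTE: FontLoader.parse() requires typeface JSON data as an argument, so this
+ // experimental component will not run as written.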
29
+ const font = new FontLoader().parse();
30
+ console.log('font', font);
31
+ const [text, setText] = useState('Text');
32
+
33
+ useEffect(() => {
34
+ setTimeout(() => {
35
+ setText(text + ' more ');
36
+ console.log('adding more text...', text);
37
+ }, 1000);
38
+ }, [text]);
39
+
40
+ return (
41
+ <mesh>
42
+ <textGeometry args={[text, {font, size: 5, height: 1}]} />
43
+ <meshPhysicalMaterial attach={'material'} color={'white'} />
44
+ </mesh>
45
+ );
46
+ }
47
+
48
+ export function Sphere({
49
+ size = 1,
50
+ amount = 50,
51
+ color = 'white',
52
+ emissive,
53
+ ...props
54
+ }) {
55
+ return (
56
+ <mesh {...props}>
57
+ <sphereGeometry args={[size, 64, 64]} />
58
+ <meshPhysicalMaterial
59
+ roughness={0}
60
+ color={color}
61
+ emissive={emissive || color}
62
+ envMapIntensity={0.2}
63
+ />
64
+ <Sparkles count={amount} scale={size * 2} size={6} speed={0.4} />
65
+ <Shadow
66
+ rotation={[-Math.PI / 2, 0, 0]}
67
+ scale={size}
68
+ position={[0, -size, 0]}
69
+ color={emissive}
70
+ opacity={0.5}
71
+ />
72
+ </mesh>
73
+ );
74
+ }
75
+
76
+ export function Title({accentColor}) {
77
+ return (
78
+ <block
79
+ args={[
80
+ {
81
+ width: 1,
82
+ height: 0.25,
83
+ backgroundOpacity: 0,
84
+ justifyContent: 'center',
85
+ },
86
+ ]}>
87
+ <ThreeMeshUIText content={'Hello '} />
88
+ <ThreeMeshUIText content={'world!'} args={[{fontColor: accentColor}]} />
89
+ </block>
90
+ );
91
+ }
92
+
93
+ export function RandomComponents() {
94
+ return (
95
+ <>
96
+ <color args={['#eee']} attach={'background'} />
97
+ <Sphere
98
+ color="white"
99
+ amount={50}
100
+ emissive="green"
101
+ glow="lightgreen"
102
+ position={[1, 1, -1]}
103
+ />
104
+ <Sphere
105
+ color="white"
106
+ amount={30}
107
+ emissive="purple"
108
+ glow="#ff90f0"
109
+ size={0.5}
110
+ position={[-1.5, 0.5, -2]}
111
+ />
112
+ <Sphere
113
+ color="lightpink"
114
+ amount={20}
115
+ emissive="orange"
116
+ glow="#ff9f50"
117
+ size={0.25}
118
+ position={[-1, 0.25, 1]}
119
+ />
120
+ <ContactShadows
121
+ renderOrder={2}
122
+ color="black"
123
+ resolution={1024}
124
+ frames={1}
125
+ scale={10}
126
+ blur={1.5}
127
+ opacity={0.65}
128
+ far={0.5}
129
+ />
130
+ <BakeShadows />
131
+ </>
132
+ );
133
+ }