thecollabagepatch committed on
Commit
7ac8db1
·
1 Parent(s): c1e9a88

update docs

Browse files
Files changed (1) hide show
  1. app.py +240 -192
app.py CHANGED
@@ -116,200 +116,248 @@ _patch_t5x_for_gpu_coords()
116
 
117
  def create_documentation_interface():
118
  """Create a Gradio interface for documentation and transparency"""
119
-
120
  with gr.Blocks(title="MagentaRT Research API", theme=gr.themes.Soft()) as interface:
121
-
122
- gr.Markdown("""
123
- # 🎵 MagentaRT Live Music Generation Research API
124
-
125
- **Research-only implementation for iOS app development**
126
-
127
- This API uses Google's [MagentaRT](https://github.com/magenta/magenta-realtime) to generate
128
- continuous music based on input audio loops for experimental iOS app development.
129
- """)
130
-
 
131
  with gr.Tabs():
132
- with gr.Tab("📖 About This Research"):
133
- gr.Markdown("""
134
- ## What This API Does
135
-
136
- We're exploring AI-assisted loop-based music creation for mobile apps. Websockets are notoriously annoying in ios-swift apps, so I tried to come up with an http version tailored to the loop based nature of an existing swift app. This API provides:
137
-
138
- ### 🎹 Single Generation (`/generate`)
139
- - Upload audio loop + BPM + style parameters
140
- - Returns 4-8 bars of AI-generated continuation
141
- - **Performance**: 4 bars in ~9s, 8 bars in ~16s (L40S GPU)
142
-
143
- ### 🔄 Continuous Jamming (`/jam/*`)
144
- - `/jam/start` - Begin continuous generation session
145
- - `/jam/next` - Get next bar-aligned chunk
146
- - `/jam/stop` - End session
147
- - **Performance**: Real-time 8-bar chunks after warmup
148
-
149
- ## Technical Specs
150
- - **Model**: MagentaRT (800M parameter transformer)
151
- - **Quality**: 48kHz stereo output
152
- - **Context**: 10-second audio analysis window
153
- - **Styles**: Text descriptions (e.g., "acid house, techno")
154
-
155
- ## Research Goals
156
- - Seamless AI music generation for loop-based composition
157
- - Real-time parameter adjustment during generation
158
- - Mobile-optimized music creation workflows
159
- """)
160
-
161
- with gr.Tab("🔧 API Documentation"):
162
- gr.Markdown("""
163
- ## Single Generation Example
164
- ```bash
165
- curl -X POST "/generate" \\
166
- -F "loop_audio=@drum_loop.wav" \\
167
- -F "bpm=120" \\
168
- -F "bars=8" \\
169
- -F "styles=acid house,techno" \\
170
- -F "guidance_weight=5.0" \\
171
- -F "temperature=1.1"
172
- ```
173
-
174
- ## Continuous Jamming Example
175
- ```bash
176
- # 1. Start session
177
- SESSION=$(curl -X POST "/jam/start" \\
178
- -F "loop_audio=@loop.wav" \\
179
- -F "bpm=120" \\
180
- -F "bars_per_chunk=8" | jq -r .session_id)
181
-
182
- # 2. Get chunks in real-time
183
- curl "/jam/next?session_id=$SESSION"
184
-
185
- # 3. Stop when done
186
- curl -X POST "/jam/stop" \\
187
- -H "Content-Type: application/json" \\
188
- -d "{\\"session_id\\": \\"$SESSION\\"}"
189
- ```
190
-
191
- ## Key Parameters
192
- - **bpm**: 60-200 (beats per minute)
193
- - **bars**: 1-16 (bars to generate)
194
- - **styles**: Text descriptions, comma-separated
195
- - **guidance_weight**: 0.1-10.0 (style adherence)
196
- - **temperature**: 0.1-2.0 (randomness)
197
- - **intro_bars_to_drop**: Skip N bars from start
198
-
199
- ## Response Format
200
- ```json
201
- {
202
- "audio_base64": "...",
203
- "metadata": {
204
- "bpm": 120,
205
- "bars": 8,
206
- "sample_rate": 48000,
207
- "loop_duration_seconds": 16.0
208
- }
209
- }
210
- ```
211
- """)
212
-
213
- with gr.Tab("📱 iOS App Integration"):
214
- gr.Markdown("""
215
- ## How Our iOS App Uses This API
216
-
217
- ### User Flow
218
- 1. **Record/Import**: User provides drum or instrument loop
219
- 2. **Parameter Setup**: Set BPM, style, generation settings
220
- 3. **Continuous Generation**: App calls `/jam/start`
221
- 4. **Real-time Playback**: App fetches chunks via `/jam/next`
222
- 5. **Seamless Mixing**: Generated audio mixed into live stream
223
-
224
- ### Technical Implementation
225
- - **Audio Format**: 48kHz WAV for consistency
226
- - **Chunk Size**: 8 bars (~16 seconds at 120 BPM)
227
- - **Buffer Management**: 3-5 chunks ahead for smooth playback
228
- - **Style Updates**: Real-time parameter adjustment via `/jam/update`
229
-
230
- ### Networking Considerations
231
- - **Latency**: ~2-3 seconds per chunk after warmup
232
- - **Bandwidth**: ~500KB per 8-bar chunk (compressed)
233
- - **Reliability**: Automatic retry with exponential backoff
234
- - **Caching**: Local buffer for offline resilience
235
- """)
236
-
237
- with gr.Tab("⚖️ Licensing & Legal"):
238
- gr.Markdown("""
239
- ## MagentaRT Licensing
240
-
241
- This project uses Google's MagentaRT model under:
242
- - **Source Code**: Apache License 2.0
243
- - **Model Weights**: Creative Commons Attribution 4.0 International
244
- - **Usage Terms**: [See MagentaRT repository](https://github.com/magenta/magenta-realtime)
245
-
246
- ### Key Requirements
247
- - ✅ **Attribution**: Credit MagentaRT in derivative works
248
- - **Responsible Use**: Don't infringe copyrights
249
- - **No Warranties**: Use at your own risk
250
- - ✅ **Patent License**: Explicit patent grants included
251
-
252
- ## Our Implementation
253
- - **Purpose**: Research and development only
254
- - **Non-Commercial**: Experimental iOS app development
255
- - **Open Source**: Will release implementation under Apache 2.0
256
- - **Attribution**: Proper credit to Google Research team
257
-
258
- ### Required Attribution
259
- ```
260
- Generated using MagentaRT
261
- Copyright 2024 Google LLC
262
- Licensed under Apache 2.0 and CC-BY 4.0
263
- Implementation for research purposes
264
- ```
265
- """)
266
-
267
- with gr.Tab("📊 Performance & Limits"):
268
- gr.Markdown("""
269
- ## Current Performance (L40S 48GB)
270
-
271
- ### ⚡ Single Generation
272
- - **4 bars @ 100 BPM**: ~9 seconds
273
- - **8 bars @ 100 BPM**: ~16 seconds
274
- - **Memory usage**: ~40GB VRAM during generation
275
-
276
- ### 🔄 Continuous Jamming
277
- - **Warmup**: ~10-15 seconds first chunk
278
- - **8-bar chunks @ 120 BPM**: Real-time delivery
279
- - **Buffer ahead**: 3-5 chunks for smooth playback
280
-
281
- ## Known Limitations
282
-
283
- ### 🎵 Model Limitations (MagentaRT)
284
- - **Context**: 10-second maximum memory
285
- - **Training**: Primarily Western instrumental music
286
- - **Vocals**: Non-lexical only, no lyric conditioning
287
- - **Structure**: No long-form song arrangement
288
- - **Inside Swift**: After a few turns of continuous chunks, the swift app works best if you restart the jam from the combined audio again. In this way you might end up with a real jam.
289
-
290
- ### 🖥️ Infrastructure Limitations
291
- - **Concurrency**: Single user jam sessions only
292
- - **GPU Memory**: 40GB+ VRAM required for stable operation
293
- - **Latency**: 2+ second minimum for style changes
294
- - **Uptime**: Research setup, no SLA guarantees
295
-
296
- ## Resource Requirements
297
- - **Minimum**: 24GB VRAM (basic operation, won't operate realtime enough for new chunks coming in)
298
- - **Recommended**: 48GB VRAM (stable performance)
299
- - **CPU**: 8+ cores
300
- - **System RAM**: 32GB+
301
- - **Storage**: 50GB+ for model weights
302
- """)
303
-
304
- gr.Markdown("""
305
- ---
306
-
307
- **🔬 Research Project** | **📱 iOS Development** | **🎵 Powered by MagentaRT**
308
-
309
- This API is part of ongoing research into AI-assisted music creation for mobile devices.
310
- For technical details, see the API documentation tabs above.
311
- """)
312
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
313
  return interface
314
 
315
  jam_registry: dict[str, JamWorker] = {}
 
116
 
117
  def create_documentation_interface():
118
  """Create a Gradio interface for documentation and transparency"""
 
119
  with gr.Blocks(title="MagentaRT Research API", theme=gr.themes.Soft()) as interface:
120
+ gr.Markdown(
121
+ r"""
122
+ # 🎵 MagentaRT Live Music Generation Research API
123
+
124
+ **Research-only implementation for iOS/web app development**
125
+
126
+ This API uses Google's [MagentaRT](https://github.com/magenta/magenta-realtime) to generate
127
+ continuous music either as **bar-aligned chunks over HTTP** or as **low-latency realtime chunks via WebSocket**.
128
+ """
129
+ )
130
+
131
  with gr.Tabs():
132
+ # ------------------------------------------------------------------
133
+ # About & current status
134
+ # ------------------------------------------------------------------
135
+ with gr.Tab("📖 About & Status"):
136
+ gr.Markdown(
137
+ r"""
138
+ ## What this is
139
+ We're exploring AI‑assisted loop‑based music creation that can run on GPUs (not just TPUs) and stream to apps in realtime.
140
+
141
+ ### Implemented backends
142
+ - **HTTP (bar‑aligned):** `/generate`, `/jam/start`, `/jam/next`, `/jam/stop`, `/jam/update`, etc.
143
+ - **WebSocket (realtime):** `wss://…/ws/jam` with `mode="rt"` (Colab‑style continuous chunks). New in this build.
144
+
145
+ ## What we learned (GPU notes)
146
+ - **L40S 48GB:** comfortably **faster than realtime** → we added a `pace: "realtime"` switch so the server doesn’t outrun playback.
147
+ - **L4 24GB:** **consistently just under realtime**; even with pre‑roll buffering, TF32/JAX tunings, reduced chunk size, and the **base** checkpoint, we still see eventual under‑runs.
148
+ - **Implication:** For production‑quality realtime, aim for ~**40GB VRAM** per user/session (e.g., **A100 40GB**, or MIG slices ≈ **35–40GB** on newer parts). Smaller GPUs can demo, but sustained realtime is not reliable.
149
+
150
+ ## Model / audio specs
151
+ - **Model:** MagentaRT (T5X; decoder RVQ depth = 16)
152
+ - **Audio:** 48 kHz stereo, 2.0 s chunks by default, 40 ms crossfade
153
+ - **Context:** 10 s rolling context window
154
+ """
155
+ )
156
+
157
+ # ------------------------------------------------------------------
158
+ # HTTP API
159
+ # ------------------------------------------------------------------
160
+ with gr.Tab("🔧 API (HTTP)"):
161
+ gr.Markdown(
162
+ r"""
163
+ ### Single Generation
164
+ ```bash
165
+ curl -X POST \
166
+ "$HOST/generate" \
167
+ -F "loop_audio=@drum_loop.wav" \
168
+ -F "bpm=120" \
169
+ -F "bars=8" \
170
+ -F "styles=acid house,techno" \
171
+ -F "guidance_weight=5.0" \
172
+ -F "temperature=1.1"
173
+ ```
174
+
175
+ ### Continuous Jamming (bar‑aligned, HTTP)
176
+ ```bash
177
+ # 1) Start a session
178
+ SESSION=$(curl -s -X POST "$HOST/jam/start" \
179
+ -F "loop_audio=@loop.wav" \
180
+ -F "bpm=120" \
181
+ -F "bars_per_chunk=8" | jq -r .session_id)
182
+ # $SESSION now holds the session_id from the JSON response
183
+
184
+ # 2) Pull next chunk (repeat)
185
+ curl "$HOST/jam/next?session_id=$SESSION"
186
+
187
+ # 3) Stop
188
+ curl -X POST "$HOST/jam/stop" \
189
+ -H "Content-Type: application/json" \
190
+ -d '{"session_id":"'$SESSION'"}'
191
+ ```
192
+
193
+ ### Common parameters
194
+ - **bpm** *(int)* – beats per minute
195
+ - **bars / bars_per_chunk** *(int)* – musical length
196
+ - **styles** *(str)* – comma‑separated text prompts (mixed internally)
197
+ - **guidance_weight** *(float)* – style adherence (CFG weight)
198
+ - **temperature / topk** – sampling controls
199
+ - **intro_bars_to_drop** *(int, /generate)* – generate-and-trim intro
200
+ """
201
+ )
202
+
203
+ # ------------------------------------------------------------------
204
+ # WebSocket API: realtime (‘rt’ mode)
205
+ # ------------------------------------------------------------------
206
+ with gr.Tab("🧩 API (WebSocket • rt mode)"):
207
+ gr.Markdown(
208
+ r"""
209
+ Connect to `wss://…/ws/jam` and send a **JSON control stream**. In `rt` mode the server emits ~2 s WAV chunks (or binary frames) continuously.
210
+
211
+ ### Start (client → server)
212
+ ```jsonc
213
+ {
214
+ "type": "start",
215
+ "mode": "rt",
216
+ "binary_audio": false, // true → raw WAV bytes + separate chunk_meta
217
+ "params": {
218
+ "styles": "heavy metal", // or "jazz, hiphop"
219
+ "style_weights": "1.0,1.0", // optional, auto‑normalized
220
+ "temperature": 1.1,
221
+ "topk": 40,
222
+ "guidance_weight": 1.1,
223
+ "pace": "realtime", // "realtime" | "asap" (default: "asap")
224
+ "max_decode_frames": 50 // 50≈2.0s; try 36–45 on smaller GPUs
225
+ }
226
+ }
227
+ ```
228
+
229
+ ### Server events (server → client)
230
+ - `{"type":"started","mode":"rt"}` – handshake
231
+ - `{"type":"chunk","audio_base64":"…","metadata":{…}}` base64 WAV
232
+ - `metadata.sample_rate` *(int)* usually 48000
233
+ - `metadata.chunk_frames` *(int)* e.g., 50
234
+ - `metadata.chunk_seconds` *(float)* frames / 25.0
235
+ - `metadata.crossfade_seconds` *(float)* – typically 0.04
236
+ - `{"type":"chunk_meta","metadata":{…}}` – sent **after** a binary frame when `binary_audio=true`
237
+ - `{"type":"status",…}`, `{"type":"error",…}`, `{"type":"stopped"}`
238
+
239
+ ### Update (client → server)
240
+ ```jsonc
241
+ {
242
+ "type": "update",
243
+ "styles": "jazz, hiphop",
244
+ "style_weights": "1.0,0.8",
245
+ "temperature": 1.2,
246
+ "topk": 64,
247
+ "guidance_weight": 1.0,
248
+ "pace": "realtime", // optional live flip
249
+ "max_decode_frames": 40 // optional; <= 50
250
+ }
251
+ ```
252
+
253
+ ### Stop / ping
254
+ ```json
255
+ {"type":"stop"}
256
+ {"type":"ping"}
257
+ ```
258
+
259
+ ### Browser quick‑start (schedules seamlessly with 25–40 ms crossfade)
260
+ ```html
261
+ <script>
262
+ const XFADE = 0.025; // 25 ms
263
+ let ctx, gain, ws, nextTime = 0;
264
+ async function start(){
265
+ ctx = new (window.AudioContext||window.webkitAudioContext)();
266
+ gain = ctx.createGain(); gain.connect(ctx.destination);
267
+ ws = new WebSocket("wss://YOUR_SPACE/ws/jam");
268
+ ws.onopen = ()=> ws.send(JSON.stringify({
269
+ type:"start", mode:"rt", binary_audio:false,
270
+ params:{ styles:"warmup", temperature:1.1, topk:40, guidance_weight:1.1, pace:"realtime" }
271
+ }));
272
+ ws.onmessage = async ev => {
273
+ const msg = JSON.parse(ev.data);
274
+ if (msg.type === "chunk" && msg.audio_base64){
275
+ const bin = atob(msg.audio_base64); const buf = new Uint8Array(bin.length);
276
+ for (let i=0;i<bin.length;i++) buf[i] = bin.charCodeAt(i);
277
+ const ab = buf.buffer; const audio = await ctx.decodeAudioData(ab);
278
+ const src = ctx.createBufferSource(); const g = ctx.createGain();
279
+ src.buffer = audio; src.connect(g); g.connect(gain);
280
+ if (nextTime < ctx.currentTime + 0.05) nextTime = ctx.currentTime + 0.12;
281
+ const startAt = nextTime, dur = audio.duration;
282
+ nextTime = startAt + Math.max(0, dur - XFADE);
283
+ g.gain.setValueAtTime(0, startAt);
284
+ g.gain.linearRampToValueAtTime(1, startAt + XFADE);
285
+ g.gain.setValueAtTime(1, startAt + Math.max(0, dur - XFADE));
286
+ g.gain.linearRampToValueAtTime(0, startAt + dur);
287
+ src.start(startAt);
288
+ }
289
+ };
290
+ }
291
+ </script>
292
+ ```
293
+
294
+ ### Python client (async)
295
+ ```python
296
+ import asyncio, json, websockets, base64, soundfile as sf, io
297
+ async def run(url):
298
+ async with websockets.connect(url) as ws:
299
+ await ws.send(json.dumps({"type":"start","mode":"rt","binary_audio":False,
300
+ "params": {"styles":"warmup","temperature":1.1,"topk":40,"guidance_weight":1.1,"pace":"realtime"}}))
301
+ while True:
302
+ msg = json.loads(await ws.recv())
303
+ if msg.get("type") == "chunk":
304
+ wav = base64.b64decode(msg["audio_base64"]) # bytes of a WAV
305
+ x, sr = sf.read(io.BytesIO(wav), dtype="float32")
306
+ print("chunk", x.shape, sr)
307
+ elif msg.get("type") in ("stopped","error"): break
308
+ asyncio.run(run("wss://YOUR_SPACE/ws/jam"))
309
+ ```
310
+ """
311
+ )
312
+
313
+ # ------------------------------------------------------------------
314
+ # Performance & hardware guidance
315
+ # ------------------------------------------------------------------
316
+ with gr.Tab("📊 Performance & Hardware"):
317
+ gr.Markdown(
318
+ r"""
319
+ ### Current observations
320
+ - **L40S 48GB** → faster than realtime. Use `pace:"realtime"` to avoid client over‑buffering.
321
+ - **L4 24GB** → slightly **below** realtime even with pre‑roll buffering, TF32/Autotune, smaller chunks (`max_decode_frames`), and the **base** checkpoint.
322
+
323
+ ### Practical guidance
324
+ - For consistent realtime, target **~40GB VRAM per active stream** (e.g., **A100 40GB**, or MIG slices ≈ **35–40GB** on newer GPUs).
325
+ - Keep client‑side **overlap‑add** (25–40 ms) for seamless chunk joins.
326
+ - Prefer **`pace:"realtime"`** once playback begins; use **ASAP** only to build a short pre‑roll if needed.
327
+ - Optional knob: **`max_decode_frames`** (default **50** ≈ 2.0 s). Reducing to **36–45** can lower per‑chunk latency/VRAM, but doesn’t increase frames/sec throughput.
328
+
329
+ ### Concurrency
330
+ This research build is designed for **one active jam per GPU**. Concurrency would require GPU partitioning (MIG) or horizontal scaling with a session scheduler.
331
+ """
332
+ )
333
+
334
+ # ------------------------------------------------------------------
335
+ # Changelog & legal
336
+ # ------------------------------------------------------------------
337
+ with gr.Tab("🗒️ Changelog & Legal"):
338
+ gr.Markdown(
339
+ r"""
340
+ ### Recent changes
341
+ - New **WebSocket realtime** route: `/ws/jam` (`mode:"rt"`)
342
+ - Added server pacing flag: `pace: "realtime" | "asap"`
343
+ - Exposed `max_decode_frames` for shorter chunks on smaller GPUs
344
+ - Client test page now does proper **overlap‑add** crossfade between chunks
345
+
346
+ ### Licensing
347
+ This project uses MagentaRT under:
348
+ - **Code:** Apache 2.0
349
+ - **Model weights:** CC‑BY 4.0
350
+ Please review the MagentaRT repo for full terms.
351
+ """
352
+ )
353
+
354
+ gr.Markdown(
355
+ r"""
356
+ ---
357
+ **🔬 Research Project** | **📱 iOS/Web Development** | **🎵 Powered by MagentaRT**
358
+ """
359
+ )
360
+
361
  return interface
362
 
363
  jam_registry: dict[str, JamWorker] = {}