dev-mode-orpheus

Paused

Tomtom84 committed on Apr 21

Commit

7f32a0e

verified ·

1 Parent(s): 96246a3

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -94,11 +94,11 @@ def build_prompt(text: str, voice: str):
 def decode_block(block7: list[int]) -> bytes:
     l1,l2,l3=[],[],[]
-    l1.append((block7[0] - AUDIO_BASE)) # Subtract AUDIO_BASE and position 0 offset
-    l2.append((block7[1] - AUDIO_BASE)) # Subtract AUDIO_BASE and position 1 offset
-    l3 += [(block7[2] - AUDIO_BASE), (block7[3] - AUDIO_BASE)] # Subtract AUDIO_BASE and position offsets
-    l2.append((block7[4] - AUDIO_BASE)) # Subtract AUDIO_BASE and position 4 offset
-    l3 += [(block7[5] - AUDIO_BASE), (block7[6] - AUDIO_BASE)] # Subtract AUDIO_BASE and position offsets
     with torch.no_grad():
         codes = [torch.tensor(x, device=device).unsqueeze(0)
@@ -165,7 +165,7 @@ async def tts(ws: WebSocket):
                 # Only append if it's an audio token
                 # Only append if it's an audio token
                 if t >= AUDIO_BASE and t < AUDIO_BASE + AUDIO_SPAN:
-                    buf.append(t) # Append original token
                     # masker.buffer_pos += 1 # Removed increment here
                     if len(buf) == 7:
                         await ws.send_bytes(decode_block(buf))

 def decode_block(block7: list[int]) -> bytes:
     l1,l2,l3=[],[],[]
+    l1.append(block7[0] - (AUDIO_BASE + 0 * 4096)) # Subtract AUDIO_BASE + position 0 offset
+    l2.append(block7[1] - (AUDIO_BASE + 1 * 4096)) # Subtract AUDIO_BASE + position 1 offset
+    l3 += [block7[2] - (AUDIO_BASE + 2 * 4096), block7[3] - (AUDIO_BASE + 3 * 4096)] # Subtract AUDIO_BASE + position offsets
+    l2.append(block7[4] - (AUDIO_BASE + 4 * 4096)) # Subtract AUDIO_BASE + position 4 offset
+    l3 += [block7[5] - (AUDIO_BASE + 5 * 4096), block7[6] - (AUDIO_BASE + 6 * 4096)] # Subtract AUDIO_BASE + position offsets
     with torch.no_grad():
         codes = [torch.tensor(x, device=device).unsqueeze(0)
                 # Only append if it's an audio token
                 # Only append if it's an audio token
                 if t >= AUDIO_BASE and t < AUDIO_BASE + AUDIO_SPAN:
+                    buf.append(t - AUDIO_BASE) # Append token relative to AUDIO_BASE
                     # masker.buffer_pos += 1 # Removed increment here
                     if len(buf) == 7:
                         await ws.send_bytes(decode_block(buf))