Update app.py
app.py CHANGED
@@ -11,11 +11,11 @@ if HF_TOKEN:
     login(HF_TOKEN)

 device = "cuda" if torch.cuda.is_available() else "cpu"
-torch.backends.cuda.enable_flash_sdp(False)  # PyTorch 2.2 bug
+#torch.backends.cuda.enable_flash_sdp(False)  # PyTorch 2.2 bug

 # 1) Constants ---------------------------------------------------------
 REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
-CHUNK_TOKENS =
+CHUNK_TOKENS = 50
 START_TOKEN = 128259
 NEW_BLOCK = 128257
 EOS_TOKEN = 128258
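Note: the first hunk turns off the PyTorch 2.2 flash-SDP workaround by commenting it out. If the workaround should stay active only on affected installs, a version guard is an alternative. A minimal sketch, not part of this commit:

    import torch

    # Hypothetical guard (not in the commit): keep the workaround only on PyTorch 2.2.x.
    major, minor = (int(p) for p in torch.__version__.split(".")[:2])
    if (major, minor) == (2, 2):
        torch.backends.cuda.enable_flash_sdp(False)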
@@ -101,45 +101,107 @@ async def tts(ws: WebSocket):

         ids, attn = build_prompt(text, voice)
         past = None
-        … (old lines 104-107 not recoverable from this page; a fragment read "past")
+        ids, attn = build_prompt(text, voice)
+        past = None  # Holds the DynamicCache object from past_key_values
+        buf = []
+        last_tok = None  # Initialize last_tok

         while True:
-            … (old lines 110-125 not recoverable from this page; a fragment read "cache_position")
+            # Determine inputs for this iteration
+            if past is None:
+                # First iteration: Use the full prompt
+                current_input_ids = ids
+                current_attn_mask = attn
+                # DO NOT pass cache_position on the first run
+                current_cache_position = None
+            else:
+                # Subsequent iterations: Use only the last token
+                if last_tok is None:
+                    print("Error: last_tok is None before subsequent generate call.")
+                    break  # Should not happen if generation proceeded
+                current_input_ids = torch.tensor([[last_tok]], device=device)
+                current_attn_mask = None  # Not needed when past_key_values is provided
+                # DO NOT pass cache_position; let DynamicCache handle it
+                current_cache_position = None
+
+            # --- Call model.generate ---
+            try:
+                gen = model.generate(
+                    input_ids=current_input_ids,
+                    attention_mask=current_attn_mask,
+                    past_key_values=past,
+                    cache_position=current_cache_position,  # Will be None after first iteration
+                    max_new_tokens=CHUNK_TOKENS,
+                    logits_processor=[masker],
+                    do_sample=True, temperature=0.7, top_p=0.95,
+                    use_cache=True,
+                    return_dict_in_generate=True,
+                    return_legacy_cache=False  # Ensures DynamicCache
+                )
+            except Exception as e:
+                print(f"❌ Error during model.generate: {e}")
+                import traceback
+                traceback.print_exc()
+                break  # Exit loop on generation error
+
+            # --- Process Output ---
+            # Get the full sequence generated *up to this point*
+            full_sequence_now = gen.sequences  # Get the sequence tensor
+
+            # Determine the sequence length *before* this generation call using the cache
+            # If past is None, the previous length was the initial prompt length
+            prev_seq_len = past.get_seq_length() if past is not None else ids.shape[1]
+
+            # The new tokens are those generated *in this call*
+            # These appear *after* the previously cached sequence length
+            # Ensure slicing is correct even if no new tokens are generated
+            if full_sequence_now.shape[1] > prev_seq_len:
+                new_token_ids = full_sequence_now[0, prev_seq_len:]
+                new = new_token_ids.tolist()  # Convert tensor to list
+            else:
+                new = []  # No new tokens generated
+
+            if not new:  # If no new tokens were generated, stop
+                print("No new tokens generated, stopping.")
                 break

-            … (old line 128 not recoverable)
-            past …
-            last_tok = new_tokens[-1]
+            # Update past_key_values for the *next* iteration
+            past = gen.past_key_values  # Update the cache state

-            …
+            # Get the very last token generated in *this* call for the *next* input
+            last_tok = new[-1]
+
+            # ----- Token handling (process the 'new' list) -----
+            eos_found = False
+            for t in new:
                 if t == EOS_TOKEN:
-                    …
+                    print("EOS token encountered.")
+                    eos_found = True
+                    break  # Stop processing tokens in this chunk
                 if t == NEW_BLOCK:
                     buf.clear()
                     continue
-                …
+                # Check if token is within the expected audio range
+                if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
+                    buf.append(t - AUDIO_BASE)
+                else:
+                    # Log unexpected tokens if necessary
+                    # print(f"Warning: Generated token {t} outside expected audio range.")
+                    pass  # Ignore unexpected tokens for now
+
                 if len(buf) == 7:
                     await ws.send_bytes(decode_block(buf))
                     buf.clear()
-                    …
+                    # Allow EOS only after the first full block is sent
+                    if not masker.sent_blocks:
+                        masker.sent_blocks = 1
+
+            if eos_found:
+                # Handle any remaining buffer content if needed (e.g., log incomplete block)
+                if len(buf) > 0:
+                    print(f"Warning: Incomplete audio block at EOS: {len(buf)} tokens. Discarding.")
+                    buf.clear()
+                break  # Exit the while loop

     except (StopIteration, WebSocketDisconnect):
         pass
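The loop above flips masker.sent_blocks after the first complete audio block goes out, so the logits processor can keep EOS unreachable until at least one block has been streamed. The masker itself is defined elsewhere in app.py; a minimal sketch of such an EOS-gating processor, assuming the transformers LogitsProcessor interface and the EOS_TOKEN constant above (the app's actual implementation may differ):

    from transformers import LogitsProcessor

    class EOSGate(LogitsProcessor):
        """Hypothetical stand-in for `masker`: block EOS until audio was sent."""

        def __init__(self, eos_token_id: int):
            self.eos_token_id = eos_token_id
            self.sent_blocks = 0  # the websocket loop sets this after the first block

        def __call__(self, input_ids, scores):
            if not self.sent_blocks:
                scores[:, self.eos_token_id] = float("-inf")  # EOS cannot be sampled yet
            return scores

    masker = EOSGate(EOS_TOKEN)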
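For reference, the cache-carrying pattern the new loop is built around can be exercised in isolation. The sketch below is illustrative only: it uses "gpt2" as a stand-in model and, unlike the loop above, passes the full running sequence back into generate together with the carried cache, which recent transformers versions crop against the cache instead of recomputing:

    from transformers import AutoModelForCausalLM, AutoTokenizer

    tok = AutoTokenizer.from_pretrained("gpt2")
    model = AutoModelForCausalLM.from_pretrained("gpt2")

    ids = tok("The quick brown fox", return_tensors="pt").input_ids
    past = None

    for _ in range(3):  # three chunks of up to 8 tokens each
        prev_len = ids.shape[1]
        out = model.generate(
            input_ids=ids,               # full running sequence so far
            past_key_values=past,
            max_new_tokens=8,
            do_sample=True, temperature=0.7, top_p=0.95,
            use_cache=True,
            return_dict_in_generate=True,
        )
        chunk = out.sequences[0, prev_len:].tolist()  # tokens produced in this call
        print(tok.decode(chunk))
        ids = out.sequences              # carry the sequence forward
        past = out.past_key_values       # carry the KV cache forward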