Tomtom84 committed (verified)
Commit 1d792aa
Parent(s): cd13e5b

Update app.py

Files changed (1):
  1. app.py (+91 -79)

app.py CHANGED
@@ -18,7 +18,6 @@ model = None
 snac = None
 masker = None
 stopping_criteria = None
-# actual_eos_token_id = None # Reverted to constant below
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # 0) Login + Device ---------------------------------------------------
@@ -33,26 +32,23 @@ if HF_TOKEN:
 REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
 START_TOKEN = 128259
 NEW_BLOCK = 128257
-# --- Reverted to using the hardcoded EOS token based on user belief ---
-#EOS_TOKEN = 128258
-# --- End Reverted EOS Token ---
+EOS_TOKEN = 128258  # Ensure this is correct for the model
 AUDIO_BASE = 128266
 AUDIO_SPAN = 4096 * 7  # 28672 codes
 CODEBOOK_SIZE = 4096  # Explicitly define the codebook size
-# Create AUDIO_IDS on the correct device later in load_models
 AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
 
 # 2) Logit mask -------------------------------------------------------
-# Uses the constant EOS_TOKEN
 class AudioMask(LogitsProcessor):
     def __init__(self, audio_ids: torch.Tensor, new_block_token_id: int, eos_token_id: int):
         super().__init__()
-        new_block_tensor = torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long)
-        eos_tensor = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
-        self.allow = torch.cat([new_block_tensor, audio_ids], dim=0)
-        self.eos = eos_tensor
+        self.allow = torch.cat([
+            torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long),
+            audio_ids,
+        ], dim=0)
+        self.eos = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
         self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0)
-        self.sent_blocks = 0
+        self.sent_blocks = 0  # State: number of audio blocks sent
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
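The class above is a whitelist over the logits: every ID outside `allow` (or `allow_with_eos` once at least one audio block has been sent) is pushed to negative infinity, so sampling can only pick permitted tokens. A minimal standalone sketch of that masking step (illustrative, not the app's exact code):

```python
import torch

def apply_whitelist(scores: torch.Tensor, allow: torch.Tensor) -> torch.Tensor:
    # scores: (batch, vocab_size) logits; allow: 1-D tensor of permitted token IDs
    masked = torch.full_like(scores, float("-inf"))
    masked[:, allow] = scores[:, allow]  # keep original logits only for allowed IDs
    return masked
```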
@@ -64,43 +60,36 @@ class AudioMask(LogitsProcessor):
         self.sent_blocks = 0
 
 # 3) StoppingCriteria for EOS -----------------------------------------
-# Uses the constant EOS_TOKEN
 class EosStoppingCriteria(StoppingCriteria):
     def __init__(self, eos_token_id: int):
         self.eos_token_id = eos_token_id
-        # No warning needed here as we are intentionally using the constant
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
-        if self.eos_token_id is None:
-            return False
         if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
-            print(f"StoppingCriteria: EOS detected (ID: {self.eos_token_id}).") # Add log
+            # print("StoppingCriteria: EOS detected.")  # Optional: uncomment for debugging
             return True
         return False
 
 # 4) Custom AudioStreamer ----------------------------------------------
 class AudioStreamer(BaseStreamer):
-    # Pass the constant EOS_TOKEN here too
-    def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str, eos_token_id: int):
+    def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str):
         self.ws = ws
         self.snac = snac_decoder
         self.masker = audio_mask
         self.loop = loop
         self.device = target_device
-        self.eos_token_id = eos_token_id # Store constant EOS ID
         self.buf: list[int] = []
         self.tasks = set()
 
     def _decode_block(self, block7: list[int]) -> bytes:
         """
         Decodes a block of 7 audio token values (AUDIO_BASE subtracted) into audio bytes.
-        NOTE: Extracts base code value (0-4095) using modulo, assuming
-        input values represent (slot_offset + code_value).
-        Maps extracted values using the structure potentially correct for Kartoffel_Orpheus.
+        Uses modulo to extract the base code value (0-4095).
+        Maps extracted values using the structure potentially correct for Kartoffel_Orpheus.
         """
         if len(block7) != 7:
-            # print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
-            return b"" # Less verbose logging
+            print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
+            return b""
 
         try:
             # --- Extract base code value (0 to CODEBOOK_SIZE-1) for each slot using modulo ---
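The extraction logic that follows (elided from this diff) strips each slot's offset so all 7 values fall in 0..4095. A hedged sketch of the block-to-levels split, assuming the common Orpheus-style SNAC layout (1 coarse, 2 medium, 4 fine codes per block; the exact slot order used by app.py is not visible here):

```python
def split_block(block7: list[int], codebook_size: int = 4096):
    vals = [v % codebook_size for v in block7]   # modulo strips each slot's offset
    l1 = [vals[0]]                               # SNAC level 1: one coarse code
    l2 = [vals[1], vals[4]]                      # SNAC level 2: two codes
    l3 = [vals[2], vals[3], vals[5], vals[6]]    # SNAC level 3: four fine codes
    return l1, l2, l3
```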
@@ -140,7 +129,10 @@ class AudioStreamer(BaseStreamer):
             audio = self.snac.decode(codes)[0]
         except Exception as e_decode:
             print(f"Streamer Error: Exception during snac.decode: {e_decode}")
-            # Add more details if needed, e.g., shapes: {[c.shape for c in codes]}
+            print(f"Input codes shapes: {[c.shape for c in codes]}")
+            print(f"Input codes dtypes: {[c.dtype for c in codes]}")
+            print(f"Input codes devices: {[c.device for c in codes]}")
+            print(f"Input code values (min/max): L1({min(l1)}/{max(l1)}) L2({min(l2)}/{max(l2)}) L3({min(l3)}/{max(l3)})")
             return b""
 
         # --- Post-processing ---
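For context, the decode call above consumes the three code lists as batched LongTensors. A sketch of that shape-wrangling, assuming the `snac` package's `decode()` accepts a list with one tensor per level (the names `l1`/`l2`/`l3` follow the debug prints above):

```python
import torch

def decode_levels(snac_model, l1, l2, l3, device="cuda"):
    codes = [
        torch.tensor(l1, dtype=torch.long, device=device).unsqueeze(0),
        torch.tensor(l2, dtype=torch.long, device=device).unsqueeze(0),
        torch.tensor(l3, dtype=torch.long, device=device).unsqueeze(0),
    ]
    with torch.no_grad():
        return snac_model.decode(codes)[0]  # waveform for the first batch item
```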
@@ -159,16 +151,14 @@ class AudioStreamer(BaseStreamer):
         try:
             await self.ws.send_bytes(data)
         except WebSocketDisconnect:
-            # This is expected if client disconnects first, don't log error
-            # print("Streamer: WebSocket disconnected during send.")
-            pass
+            print("Streamer: WebSocket disconnected during send.")
         except Exception as e:
-            if "Cannot call \"send\" once a close message has been sent" in str(e) or \
-               "Connection is closed" in str(e):
-                # This is expected if client disconnects during generation, suppress repetitive logs
-                pass
-            else:
+            # Log errors other than expected disconnects more visibly maybe
+            if "Cannot call \"send\" once a close message has been sent" not in str(e):
                 print(f"Streamer: Error sending bytes: {e}")
+            # else: # Optionally print disconnect errors quietly
+            #     print("Streamer: Attempted send after close.")
+            pass  # Avoid flooding logs if client disconnects early
 
     def put(self, value: torch.LongTensor):
         """
@@ -177,40 +167,56 @@ class AudioStreamer(BaseStreamer):
         """
         if value.numel() == 0:
             return
-        new_token_ids = value.squeeze().cpu().tolist()
+        # Ensure value is on CPU and flatten to a list of ints
+        new_token_ids = value.squeeze().cpu().tolist()  # Move to CPU before list conversion
         if isinstance(new_token_ids, int):
            new_token_ids = [new_token_ids]
 
         for t in new_token_ids:
-            # No need to check for EOS here, StoppingCriteria handles it
+            # --- DEBUGGING PRINT ---
+            # Log every token ID received from the model
+            print(f"Streamer received token ID: {t}")
+            # --- END DEBUGGING ---
+
+            if t == EOS_TOKEN:
+                # print("Streamer: EOS token encountered.")  # Optional debugging
+                break  # Stop processing this batch if EOS is found
+
             if t == NEW_BLOCK:
+                # print("Streamer: NEW_BLOCK token encountered.")  # Optional debugging
                 self.buf.clear()
-                continue
+                continue  # Move to the next token
 
-            # Use the constant EOS_TOKEN for comparison if needed (e.g. for logging)
+            # Check if token is within the expected audio range
             if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
                 self.buf.append(t - AUDIO_BASE)  # Store value relative to base
-            # else: # Optionally log ignored tokens
-            #     if t != self.eos_token_id: # Don't warn about the EOS token itself
-            #         print(f"Streamer Warning: Ignoring unexpected token {t}")
+            # else: # Log unexpected tokens if needed
+            #     print(f"Streamer Warning: Ignoring unexpected token {t} (outside audio range [{AUDIO_BASE}, {AUDIO_BASE + AUDIO_SPAN}))")
+            pass
 
+            # If buffer has 7 tokens, decode and send
             if len(self.buf) == 7:
                 audio_bytes = self._decode_block(self.buf)
-                self.buf.clear()
+                self.buf.clear()  # Clear buffer after processing
 
-                if audio_bytes:
+                if audio_bytes:  # Only send if decoding was successful
+                    # Schedule the async send function to run on the main event loop
                     future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
                     self.tasks.add(future)
+                    # Optional: remove completed tasks to prevent a memory leak if generation is very long
                     future.add_done_callback(self.tasks.discard)
 
+                # Allow EOS only after the first full block has been processed and scheduled for sending
                 if self.masker.sent_blocks == 0:
-                    self.masker.sent_blocks = 1
+                    # print("Streamer: First audio block processed, allowing EOS.")
+                    self.masker.sent_blocks = 1  # Update state in the mask
 
     def end(self):
         """Called by generate() when generation finishes."""
         if len(self.buf) > 0:
             print(f"Streamer: End of generation with incomplete block ({len(self.buf)} tokens). Discarding.")
             self.buf.clear()
+        # print("Streamer: Generation finished.")  # Optional debugging
         pass
 
 # 5) FastAPI App ------------------------------------------------------
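Note that `put()` is called from the worker thread running `generate()`, so it cannot `await`; it hands coroutines to the main event loop instead. That handoff, in isolation (a minimal sketch):

```python
import asyncio

pending: set = set()  # keep references so futures are not garbage-collected early

def schedule_on_loop(loop: asyncio.AbstractEventLoop, coro) -> None:
    future = asyncio.run_coroutine_threadsafe(coro, loop)  # thread-safe scheduling
    pending.add(future)
    future.add_done_callback(pending.discard)  # drop the reference once done
```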
@@ -218,8 +224,7 @@ app = FastAPI()
 
 @app.on_event("startup")
 async def load_models_startup():
-    # Keep global references, but EOS_TOKEN is now a constant again
-    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU
+    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU, EOS_TOKEN
 
     print(f"🚀 Starting up on device: {device}")
     print("⏳ Loading models …", flush=True)
@@ -245,31 +250,41 @@ async def load_models_startup():
         torch_dtype=model_dtype,
         low_cpu_mem_usage=True,
     )
+
+    # --- Verify EOS Token ---
+    # Use the actual EOS token ID from the loaded model/tokenizer config
+    config_eos_id = model.config.eos_token_id
+    tokenizer_eos_id = tok.eos_token_id
+
+    if config_eos_id is None:
+        print("🚨 WARNING: model.config.eos_token_id is None!")
+        # Fallback or default? Let's use the constant for now, but this needs checking.
+        final_eos_token_id = EOS_TOKEN
+    elif tokenizer_eos_id is not None and config_eos_id != tokenizer_eos_id:
+        print(f"⚠️ WARNING: Mismatch! model.config.eos_token_id ({config_eos_id}) != tok.eos_token_id ({tokenizer_eos_id}). Using model config ID.")
+        final_eos_token_id = config_eos_id
+    else:
+        final_eos_token_id = config_eos_id
+
+    # Update the global constant if it differs or wasn't set properly by config
+    if final_eos_token_id != EOS_TOKEN:
+        print(f"🔄 Updating EOS_TOKEN constant from {EOS_TOKEN} to {final_eos_token_id}")
+        EOS_TOKEN = final_eos_token_id  # Update the global constant
+
+    # Set pad_token_id to the determined EOS token ID
+    model.config.pad_token_id = EOS_TOKEN
+    print(f"Using EOS Token ID: {EOS_TOKEN}")
+    # --- End Verify EOS Token ---
+
     print(f"Model loaded to {model.device} with dtype {model.dtype}.")
     model.eval()
 
-    # --- Print comparison for EOS token IDs but use the constant ---
-    conf_eos = model.config.eos_token_id
-    tok_eos = tok.eos_token_id
-    print(f"Model Config EOS ID: {conf_eos}")
-    print(f"Tokenizer EOS ID: {tok_eos}")
-    print(f"Using Constant EOS_TOKEN: {EOS_TOKEN}") # State the used constant
-    if conf_eos != EOS_TOKEN or tok_eos != EOS_TOKEN:
-        print(f"⚠️ WARNING: Constant EOS_TOKEN {EOS_TOKEN} differs from model/tokenizer IDs ({conf_eos}/{tok_eos}).")
-    # --- End EOS comparison ---
-
-    # Set pad_token_id if None (use the constant EOS)
-    if model.config.pad_token_id is None:
-        print(f"Setting model.config.pad_token_id to Constant EOS token ID ({EOS_TOKEN})")
-        model.config.pad_token_id = EOS_TOKEN
-
     audio_ids_device = AUDIO_IDS_CPU.to(device)
-    # Pass the constant EOS_TOKEN to the mask
-    masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN)
+    masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN)  # Use updated EOS_TOKEN
     print("AudioMask initialized.")
 
-    # Pass the constant EOS_TOKEN to the stopping criteria
-    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)])
+    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)])  # Use updated EOS_TOKEN
     print("StoppingCriteria initialized.")
 
     print("✅ Models loaded and ready!", flush=True)
@@ -296,7 +311,6 @@ def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
 # 7) WebSocket endpoint (simplified with streamer) ---------------------
 @app.websocket("/ws/tts")
 async def tts(ws: WebSocket):
-    # No need for global actual_eos_token_id
     await ws.accept()
     print("🔌 Client connected")
     streamer = None
@@ -317,28 +331,25 @@ async def tts(ws: WebSocket):
             print(f"Generating audio for: '{text}' with voice '{voice}'")
             ids, attn = build_prompt(text, voice)
             masker.reset()
-            # Pass the constant EOS_TOKEN to streamer
-            streamer = AudioStreamer(ws, snac, masker, main_loop, device, EOS_TOKEN)
+            streamer = AudioStreamer(ws, snac, masker, main_loop, device)
 
             print("Starting generation in background thread...")
-            # Use sampling parameters with anti-repetition measures
+            # --- DEBUGGING: Adjusted Generation Parameters ---
             await asyncio.to_thread(
                 model.generate,
                 input_ids=ids,
                 attention_mask=attn,
-                max_new_tokens=2500, # Or adjust as needed
+                max_new_tokens=1500,  # Keep lower for faster debugging cycles initially
                 logits_processor=[masker],
                 stopping_criteria=stopping_criteria,
-                # --- Sampling Parameters with Anti-Repetition ---
                 do_sample=True,
-                temperature=0.6, # Adjust if needed
-                top_p=0.9, # Adjust if needed
-                repetition_penalty=1.2, # Increased (experiment!)
-                no_repeat_ngram_size=4, # Added (experiment!)
-                # --- End Sampling Parameters ---
+                # --- Adjusted Parameters for Debugging Repetition ---
+                temperature=0.7,  # Slightly higher temperature
+                # top_p=0.9,  # Commented out top_p for simpler testing
+                repetition_penalty=1.2,  # Slightly stronger penalty
+                # --- End Adjusted Parameters ---
                 use_cache=True,
-                streamer=streamer,
-                eos_token_id=EOS_TOKEN # Explicitly pass constant EOS ID
+                streamer=streamer,
             )
             print("Generation thread finished.")
 
@@ -371,8 +382,7 @@ async def tts(ws: WebSocket):
         try:
             await ws.close(code=1000)
         except RuntimeError as e_close:
-            if "Cannot call \"send\"" not in str(e_close) and "Connection is closed" not in str(e_close):
-                print(f"Runtime error closing websocket: {e_close}")
+            print(f"Runtime error closing websocket: {e_close}")
         except Exception as e_close_final:
             print(f"Error closing websocket: {e_close_final}")
     elif ws.client_state.name != "DISCONNECTED":
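Note that the generation hunk above also dropped the explicit `eos_token_id=EOS_TOKEN` argument, so `generate()` now falls back to the model's generation config plus the custom stopping criteria. Passing the ID explicitly remains a supported transformers kwarg (sketch; the value is illustrative):

```python
output_ids = model.generate(
    input_ids=ids,
    attention_mask=attn,
    max_new_tokens=1500,
    do_sample=True,
    eos_token_id=128258,  # assumes this is the model's true EOS ID
)
```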
@@ -383,4 +393,6 @@ async def tts(ws: WebSocket):
 if __name__ == "__main__":
     import uvicorn
     print("Starting Uvicorn server...")
+    # Note: Consider running with --workers 1 if you face issues with globals/GPU memory
+    # uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info", workers=1)
     uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
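For reference, a hypothetical client for the /ws/tts endpoint. The server reads `text` and `voice` from the request, but the exact JSON schema is not visible in this diff, so the field values below are assumptions:

```python
import asyncio
import json

import websockets  # pip install websockets

async def main():
    async with websockets.connect("ws://localhost:7860/ws/tts") as ws:
        await ws.send(json.dumps({"text": "Hallo, Welt!", "voice": "default"}))
        with open("out.raw", "wb") as f:
            async for msg in ws:
                if isinstance(msg, bytes):
                    f.write(msg)  # raw audio bytes, streamed block by block

asyncio.run(main())
```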
 