dev-mode-orpheus

Paused

App Files Files Community

Tomtom84 committed on Apr 21

Commit

d11cc63

verified ·

1 Parent(s): 96dc59a

Update app.py

Browse files

Files changed (1) hide show

app.py +53 -70

app.py CHANGED Viewed

@@ -18,7 +18,7 @@ model = None
 snac = None
 masker = None
 stopping_criteria = None
-actual_eos_token_id = None # Will be determined during startup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # 0) Login + Device ---------------------------------------------------
@@ -31,10 +31,11 @@ if HF_TOKEN:
 # 1) Konstanten -------------------------------------------------------
 REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
-# CHUNK_TOKENS = 50 # Not directly used by us with the streamer approach
 START_TOKEN = 128259
 NEW_BLOCK = 128257
-# EOS_TOKEN = 128258 # REMOVED - Will be determined from model/tokenizer config
 AUDIO_BASE = 128266
 AUDIO_SPAN = 4096 * 7  # 28672 Codes
 CODEBOOK_SIZE = 4096  # Explicitly define the codebook size
@@ -42,61 +43,51 @@ CODEBOOK_SIZE = 4096  # Explicitly define the codebook size
 AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
 # 2) Logit‑Mask -------------------------------------------------------
-# Uses the dynamically determined EOS token ID
 class AudioMask(LogitsProcessor):
     def __init__(self, audio_ids: torch.Tensor, new_block_token_id: int, eos_token_id: int):
         super().__init__()
-        # Ensure input tensors are Long type for concatenation if needed, although indices are usually int
         new_block_tensor = torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long)
         eos_tensor = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
-        # Allow NEW_BLOCK and all valid audio tokens initially
         self.allow = torch.cat([new_block_tensor, audio_ids], dim=0)
-        self.eos = eos_tensor # Store EOS token ID as tensor
-        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0) # Precompute combined tensor
-        self.sent_blocks = 0 # State: Number of audio blocks sent
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
-        # Determine which tokens are allowed based on whether blocks have been sent
         current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
-        # Create a mask initialized to negative infinity
         mask = torch.full_like(scores, float("-inf"))
-        # Set allowed token scores to 0 (effectively allowing them)
         mask[:, current_allow] = 0
-        # Apply the mask to the scores
         return scores + mask
     def reset(self):
-        """Resets the state for a new generation request."""
         self.sent_blocks = 0
 # 3) StoppingCriteria für EOS ---------------------------------------
-# Uses the dynamically determined EOS token ID
 class EosStoppingCriteria(StoppingCriteria):
     def __init__(self, eos_token_id: int):
         self.eos_token_id = eos_token_id
-        if self.eos_token_id is None:
-             print("⚠️ EosStoppingCriteria initialized with eos_token_id=None!")
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
         if self.eos_token_id is None:
-             return False # Cannot stop if EOS ID is unknown
-        # Check if the *last* generated token is the EOS token
         if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
-            # print("StoppingCriteria: EOS detected.")
             return True
         return False
 # 4) Benutzerdefinierter AudioStreamer -------------------------------
 class AudioStreamer(BaseStreamer):
     def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str, eos_token_id: int):
         self.ws = ws
         self.snac = snac_decoder
         self.masker = audio_mask
         self.loop = loop
         self.device = target_device
-        self.eos_token_id = eos_token_id # Store EOS ID for potential use in put (optional)
         self.buf: list[int] = []
         self.tasks = set()
@@ -108,8 +99,8 @@ class AudioStreamer(BaseStreamer):
               Maps extracted values using the structure potentially correct for Kartoffel_Orpheus.
         """
         if len(block7) != 7:
-            print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
-            return b""
         try:
             # --- Extract base code value (0 to CODEBOOK_SIZE-1) for each slot using modulo ---
@@ -129,7 +120,7 @@ class AudioStreamer(BaseStreamer):
         except IndexError:
             print(f"Streamer Error: Index out of bounds during token mapping. Block: {block7}")
             return b""
-        except Exception as e_map: # Catch potential issues with modulo/mapping
             print(f"Streamer Error: Exception during code value extraction/mapping: {e_map}. Block: {block7}")
             return b""
@@ -149,10 +140,7 @@ class AudioStreamer(BaseStreamer):
                 audio = self.snac.decode(codes)[0]
         except Exception as e_decode:
             print(f"Streamer Error: Exception during snac.decode: {e_decode}")
-            print(f"Input codes shapes: {[c.shape for c in codes]}")
-            print(f"Input codes dtypes: {[c.dtype for c in codes]}")
-            print(f"Input codes devices: {[c.device for c in codes]}")
-            print(f"Input code values (min/max): L1({min(l1)}/{max(l1)}) L2({min(l2)}/{max(l2)}) L3({min(l3)}/{max(l3)})")
             return b""
         # --- Post-processing ---
@@ -171,10 +159,12 @@ class AudioStreamer(BaseStreamer):
         try:
             await self.ws.send_bytes(data)
         except WebSocketDisconnect:
-            print("Streamer: WebSocket disconnected during send.")
         except Exception as e:
-            # Handle cases where sending fails after connection closed
-            if "Cannot call \"send\" once a close message has been sent" in str(e):
                  # This is expected if client disconnects during generation, suppress repetitive logs
                  pass
             else:
@@ -187,7 +177,6 @@ class AudioStreamer(BaseStreamer):
         """
         if value.numel() == 0:
             return
-        # Ensure value is on CPU and flatten to a list of ints
         new_token_ids = value.squeeze().cpu().tolist()
         if isinstance(new_token_ids, int):
             new_token_ids = [new_token_ids]
@@ -198,23 +187,22 @@ class AudioStreamer(BaseStreamer):
                 self.buf.clear()
                 continue
             if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
                 self.buf.append(t - AUDIO_BASE) # Store value relative to base
-            # else: # Optionally log ignored tokens outside audio range
-                # if t != self.eos_token_id: # Don't warn about the EOS token itself
-                #      print(f"Streamer Warning: Ignoring unexpected token {t}")
             if len(self.buf) == 7:
                 audio_bytes = self._decode_block(self.buf)
                 self.buf.clear()
                 if audio_bytes:
-                    # Schedule the async send function to run on the main event loop
                     future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
                     self.tasks.add(future)
                     future.add_done_callback(self.tasks.discard)
-                    # Allow EOS only after the first full block has been processed
                     if self.masker.sent_blocks == 0:
                         self.masker.sent_blocks = 1
@@ -230,7 +218,8 @@ app = FastAPI()
 @app.on_event("startup")
 async def load_models_startup():
-    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU, actual_eos_token_id
     print(f"🚀 Starting up on device: {device}")
     print("⏳ Lade Modelle …", flush=True)
@@ -259,34 +248,28 @@ async def load_models_startup():
     print(f"Model loaded to {model.device} with dtype {model.dtype}.")
     model.eval()
-    # --- Determine and set the correct EOS token ID ---
     conf_eos = model.config.eos_token_id
     tok_eos = tok.eos_token_id
     print(f"Model Config EOS ID: {conf_eos}")
     print(f"Tokenizer EOS ID: {tok_eos}")
-    if conf_eos is not None:
-        actual_eos_token_id = conf_eos
-    elif tok_eos is not None:
-        actual_eos_token_id = tok_eos
-        print(f"⚠️ Model config EOS ID is None, using Tokenizer EOS ID: {actual_eos_token_id}")
-    else:
-        raise ValueError("Could not determine EOS token ID from model config or tokenizer.")
-    print(f"Using EOS Token ID: {actual_eos_token_id}")
-    # Set pad_token_id to eos_token_id if not already set (common practice for generation)
     if model.config.pad_token_id is None:
-         print(f"Setting model.config.pad_token_id to EOS token ID ({actual_eos_token_id})")
-         model.config.pad_token_id = actual_eos_token_id
-    # --- End EOS Token ID determination ---
     audio_ids_device = AUDIO_IDS_CPU.to(device)
-    # Pass the determined EOS ID to the mask
-    masker = AudioMask(audio_ids_device, NEW_BLOCK, actual_eos_token_id)
     print("AudioMask initialized.")
-    # Pass the determined EOS ID to the stopping criteria
-    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(actual_eos_token_id)])
     print("StoppingCriteria initialized.")
     print("✅ Modelle geladen und bereit!", flush=True)
@@ -313,7 +296,7 @@ def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
 # 7) WebSocket‑Endpoint (vereinfacht mit Streamer) ---------------------
 @app.websocket("/ws/tts")
 async def tts(ws: WebSocket):
-    global actual_eos_token_id # Ensure we can access the determined EOS ID
     await ws.accept()
     print("🔌 Client connected")
     streamer = None
@@ -334,27 +317,28 @@ async def tts(ws: WebSocket):
         print(f"Generating audio for: '{text}' with voice '{voice}'")
         ids, attn = build_prompt(text, voice)
         masker.reset()
-        # Pass the determined EOS ID to the streamer as well (optional, for logging/checks)
-        streamer = AudioStreamer(ws, snac, masker, main_loop, device, actual_eos_token_id)
         print("Starting generation in background thread...")
-        # Use sampling parameters to avoid repetition
         await asyncio.to_thread(
             model.generate,
             input_ids=ids,
             attention_mask=attn,
-            max_new_tokens=2500, # Increased slightly, adjust as needed
             logits_processor=[masker],
             stopping_criteria=stopping_criteria,
-            # --- Sampling Parameters ---
             do_sample=True,
-            temperature=0.6,
-            top_p=0.9,
-            repetition_penalty=1.15,
             # --- End Sampling Parameters ---
             use_cache=True,
             streamer=streamer,
-            eos_token_id=actual_eos_token_id # Explicitly pass correct EOS ID here too
         )
         print("Generation thread finished.")
@@ -387,8 +371,7 @@ async def tts(ws: WebSocket):
             try:
                 await ws.close(code=1000)
             except RuntimeError as e_close:
-                 # Suppress "Cannot call 'send'..." error during final close if already disconnected
-                 if "Cannot call \"send\"" not in str(e_close):
                       print(f"Runtime error closing websocket: {e_close}")
             except Exception as e_close_final:
                  print(f"Error closing websocket: {e_close_final}")

 snac = None
 masker = None
 stopping_criteria = None
+# actual_eos_token_id = None # Reverted to constant below
 device = "cuda" if torch.cuda.is_available() else "cpu"
 # 0) Login + Device ---------------------------------------------------
 # 1) Konstanten -------------------------------------------------------
 REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
 START_TOKEN = 128259
 NEW_BLOCK = 128257
+# --- Reverted to the hardcoded EOS token ID; startup only logs a warning if the model/tokenizer IDs differ ---
+EOS_TOKEN = 128258
+# --- End Reverted EOS Token ---
 AUDIO_BASE = 128266
 AUDIO_SPAN = 4096 * 7  # 28672 Codes
 CODEBOOK_SIZE = 4096  # Explicitly define the codebook size
 AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
 # 2) Logit‑Mask -------------------------------------------------------
+# Receives the EOS ID through its constructor; startup passes the constant EOS_TOKEN
 class AudioMask(LogitsProcessor):
     def __init__(self, audio_ids: torch.Tensor, new_block_token_id: int, eos_token_id: int):
         super().__init__()
         new_block_tensor = torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long)
         eos_tensor = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
         self.allow = torch.cat([new_block_tensor, audio_ids], dim=0)
+        self.eos = eos_tensor
+        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0)
+        self.sent_blocks = 0
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
         current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
         mask = torch.full_like(scores, float("-inf"))
         mask[:, current_allow] = 0
         return scores + mask
     def reset(self):
         self.sent_blocks = 0
 # 3) StoppingCriteria für EOS ---------------------------------------
+# Receives the EOS ID through its constructor; startup passes the constant EOS_TOKEN
 class EosStoppingCriteria(StoppingCriteria):
     def __init__(self, eos_token_id: int):
         self.eos_token_id = eos_token_id
+        # No warning needed here as we are intentionally using the constant
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
         if self.eos_token_id is None:
+             return False
         if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
+            print(f"StoppingCriteria: EOS detected (ID: {self.eos_token_id}).") # Add log
             return True
         return False
 # 4) Benutzerdefinierter AudioStreamer -------------------------------
 class AudioStreamer(BaseStreamer):
+    # Pass the constant EOS_TOKEN here too
     def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str, eos_token_id: int):
         self.ws = ws
         self.snac = snac_decoder
         self.masker = audio_mask
         self.loop = loop
         self.device = target_device
+        self.eos_token_id = eos_token_id # Store constant EOS ID
         self.buf: list[int] = []
         self.tasks = set()
               Maps extracted values using the structure potentially correct for Kartoffel_Orpheus.
         """
         if len(block7) != 7:
+            # print(f"Streamer Warning: _decode_block received {len(block7)} tokens, expected 7. Skipping.")
+            return b"" # Less verbose logging
         try:
             # --- Extract base code value (0 to CODEBOOK_SIZE-1) for each slot using modulo ---
         except IndexError:
             print(f"Streamer Error: Index out of bounds during token mapping. Block: {block7}")
             return b""
+        except Exception as e_map:
             print(f"Streamer Error: Exception during code value extraction/mapping: {e_map}. Block: {block7}")
             return b""
                 audio = self.snac.decode(codes)[0]
         except Exception as e_decode:
             print(f"Streamer Error: Exception during snac.decode: {e_decode}")
+            # Add more details if needed, e.g., shapes: {[c.shape for c in codes]}
             return b""
         # --- Post-processing ---
         try:
             await self.ws.send_bytes(data)
         except WebSocketDisconnect:
+             # This is expected if client disconnects first, don't log error
+             # print("Streamer: WebSocket disconnected during send.")
+             pass
         except Exception as e:
+            if "Cannot call \"send\" once a close message has been sent" in str(e) or \
+               "Connection is closed" in str(e):
                  # This is expected if client disconnects during generation, suppress repetitive logs
                  pass
             else:
         """
         if value.numel() == 0:
             return
         new_token_ids = value.squeeze().cpu().tolist()
         if isinstance(new_token_ids, int):
             new_token_ids = [new_token_ids]
                 self.buf.clear()
                 continue
+            # Use the constant EOS_TOKEN for comparison if needed (e.g. for logging)
             if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
                 self.buf.append(t - AUDIO_BASE) # Store value relative to base
+            # else: # Optionally log ignored tokens
+            #     if t != self.eos_token_id: # Don't warn about the EOS token itself
+            #          print(f"Streamer Warning: Ignoring unexpected token {t}")
             if len(self.buf) == 7:
                 audio_bytes = self._decode_block(self.buf)
                 self.buf.clear()
                 if audio_bytes:
                     future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
                     self.tasks.add(future)
                     future.add_done_callback(self.tasks.discard)
                     if self.masker.sent_blocks == 0:
                         self.masker.sent_blocks = 1
 @app.on_event("startup")
 async def load_models_startup():
+    # Keep global references, but EOS_TOKEN is now a constant again
+    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU
     print(f"🚀 Starting up on device: {device}")
     print("⏳ Lade Modelle …", flush=True)
     print(f"Model loaded to {model.device} with dtype {model.dtype}.")
     model.eval()
+    # --- Print comparison for EOS token IDs but use the constant ---
     conf_eos = model.config.eos_token_id
     tok_eos = tok.eos_token_id
     print(f"Model Config EOS ID: {conf_eos}")
     print(f"Tokenizer EOS ID: {tok_eos}")
+    print(f"Using Constant EOS_TOKEN: {EOS_TOKEN}") # State the used constant
+    if conf_eos != EOS_TOKEN or tok_eos != EOS_TOKEN:
+         print(f"⚠️ WARNING: Constant EOS_TOKEN {EOS_TOKEN} differs from model/tokenizer IDs ({conf_eos}/{tok_eos}).")
+    # --- End EOS comparison ---
+    # Set pad_token_id if None (use the constant EOS)
     if model.config.pad_token_id is None:
+         print(f"Setting model.config.pad_token_id to Constant EOS token ID ({EOS_TOKEN})")
+         model.config.pad_token_id = EOS_TOKEN
     audio_ids_device = AUDIO_IDS_CPU.to(device)
+    # Pass the constant EOS_TOKEN to the mask
+    masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN)
     print("AudioMask initialized.")
+    # Pass the constant EOS_TOKEN to the stopping criteria
+    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)])
     print("StoppingCriteria initialized.")
     print("✅ Modelle geladen und bereit!", flush=True)
 # 7) WebSocket‑Endpoint (vereinfacht mit Streamer) ---------------------
 @app.websocket("/ws/tts")
 async def tts(ws: WebSocket):
+    # No need for global actual_eos_token_id
     await ws.accept()
     print("🔌 Client connected")
     streamer = None
         print(f"Generating audio for: '{text}' with voice '{voice}'")
         ids, attn = build_prompt(text, voice)
         masker.reset()
+        # Pass the constant EOS_TOKEN to streamer
+        streamer = AudioStreamer(ws, snac, masker, main_loop, device, EOS_TOKEN)
         print("Starting generation in background thread...")
+        # Use sampling parameters with anti-repetition measures
         await asyncio.to_thread(
             model.generate,
             input_ids=ids,
             attention_mask=attn,
+            max_new_tokens=2500, # Or adjust as needed
             logits_processor=[masker],
             stopping_criteria=stopping_criteria,
+            # --- Sampling Parameters with Anti-Repetition ---
             do_sample=True,
+            temperature=0.6,         # Adjust if needed
+            top_p=0.9,             # Adjust if needed
+            repetition_penalty=1.2,  # Increased (experiment!)
+            no_repeat_ngram_size=4,  # Added (experiment!)
             # --- End Sampling Parameters ---
             use_cache=True,
             streamer=streamer,
+            eos_token_id=EOS_TOKEN # Explicitly pass constant EOS ID
         )
         print("Generation thread finished.")
             try:
                 await ws.close(code=1000)
             except RuntimeError as e_close:
+                 if "Cannot call \"send\"" not in str(e_close) and "Connection is closed" not in str(e_close):
                       print(f"Runtime error closing websocket: {e_close}")
             except Exception as e_close_final:
                  print(f"Error closing websocket: {e_close_final}")