Tomtom84 committed (verified)
Commit 96dc59a · Parent(s): 95e254f

Update app.py

Files changed (1): app.py (+90 -48)
app.py CHANGED
@@ -18,6 +18,7 @@ model = None
 snac = None
 masker = None
 stopping_criteria = None
+actual_eos_token_id = None # Will be determined during startup
 device = "cuda" if torch.cuda.is_available() else "cpu"
 
 # 0) Login + Device ---------------------------------------------------
@@ -33,7 +34,7 @@ REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
 # CHUNK_TOKENS = 50 # Not directly used by us with the streamer approach
 START_TOKEN = 128259
 NEW_BLOCK = 128257
-EOS_TOKEN = 128258
+# EOS_TOKEN = 128258 # REMOVED - Will be determined from model/tokenizer config
 AUDIO_BASE = 128266
 AUDIO_SPAN = 4096 * 7 # 28672 Codes
 CODEBOOK_SIZE = 4096 # Explicitly define the codebook size
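A note on these constants (not part of the commit): AUDIO_SPAN covers seven codebook slots of 4096 codes each. Assuming the usual Orpheus block layout, in which the i-th token of a 7-token block is offset by i * CODEBOOK_SIZE (this is what the `% CODEBOOK_SIZE` in `_decode_block` strips), an absolute token ID splits as sketched below; `split_audio_token` is a hypothetical helper, not a name in app.py.

```python
# Sketch only: map an absolute audio token ID to a (slot, code) pair.
AUDIO_BASE = 128266
CODEBOOK_SIZE = 4096

def split_audio_token(token_id: int) -> tuple[int, int]:
    rel = token_id - AUDIO_BASE       # position within the audio span
    slot = rel // CODEBOOK_SIZE       # which of the 7 positions in a block (0..6)
    code = rel % CODEBOOK_SIZE        # code value within that slot's codebook
    return slot, code

assert split_audio_token(AUDIO_BASE) == (0, 0)
assert split_audio_token(AUDIO_BASE + 4096 * 7 - 1) == (6, 4095)
```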
@@ -41,45 +42,61 @@ CODEBOOK_SIZE = 4096 # Explicitly define the codebook size
 AUDIO_IDS_CPU = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN)
 
 # 2) Logit‑Mask -------------------------------------------------------
+# Uses the dynamically determined EOS token ID
 class AudioMask(LogitsProcessor):
     def __init__(self, audio_ids: torch.Tensor, new_block_token_id: int, eos_token_id: int):
         super().__init__()
+        # Ensure input tensors are Long type for concatenation if needed, although indices are usually int
+        new_block_tensor = torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long)
+        eos_tensor = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
+
         # Allow NEW_BLOCK and all valid audio tokens initially
-        self.allow = torch.cat([
-            torch.tensor([new_block_token_id], device=audio_ids.device, dtype=torch.long),
-            audio_ids
-        ], dim=0)
-        self.eos = torch.tensor([eos_token_id], device=audio_ids.device, dtype=torch.long)
-        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0)
+        self.allow = torch.cat([new_block_tensor, audio_ids], dim=0)
+        self.eos = eos_tensor # Store EOS token ID as tensor
+        self.allow_with_eos = torch.cat([self.allow, self.eos], dim=0) # Precompute combined tensor
         self.sent_blocks = 0 # State: Number of audio blocks sent
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
+        # Determine which tokens are allowed based on whether blocks have been sent
         current_allow = self.allow_with_eos if self.sent_blocks > 0 else self.allow
+
+        # Create a mask initialized to negative infinity
         mask = torch.full_like(scores, float("-inf"))
+        # Set allowed token scores to 0 (effectively allowing them)
         mask[:, current_allow] = 0
+        # Apply the mask to the scores
         return scores + mask
 
     def reset(self):
+        """Resets the state for a new generation request."""
        self.sent_blocks = 0
 
 # 3) StoppingCriteria für EOS ---------------------------------------
+# Uses the dynamically determined EOS token ID
 class EosStoppingCriteria(StoppingCriteria):
     def __init__(self, eos_token_id: int):
         self.eos_token_id = eos_token_id
+        if self.eos_token_id is None:
+            print("⚠️ EosStoppingCriteria initialized with eos_token_id=None!")
 
     def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
+        if self.eos_token_id is None:
+            return False # Cannot stop if EOS ID is unknown
+        # Check if the *last* generated token is the EOS token
         if input_ids.shape[1] > 0 and input_ids[:, -1] == self.eos_token_id:
+            # print("StoppingCriteria: EOS detected.")
             return True
         return False
 
 # 4) Benutzerdefinierter AudioStreamer -------------------------------
 class AudioStreamer(BaseStreamer):
-    def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str):
+    def __init__(self, ws: WebSocket, snac_decoder: SNAC, audio_mask: AudioMask, loop: asyncio.AbstractEventLoop, target_device: str, eos_token_id: int):
         self.ws = ws
         self.snac = snac_decoder
         self.masker = audio_mask
         self.loop = loop
         self.device = target_device
+        self.eos_token_id = eos_token_id # Store EOS ID for potential use in put (optional)
         self.buf: list[int] = []
         self.tasks = set()
 
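For intuition (not part of the commit), here is the masking arithmetic from AudioMask.__call__ on a toy vocabulary: while sent_blocks == 0, EOS is excluded from the allowed set, so it stays at -inf and generation cannot stop before the first audio block exists.

```python
# Minimal sketch of the AudioMask logic with a toy vocabulary of 10 tokens.
import torch

allow = torch.tensor([2, 5, 6])            # stand-ins for NEW_BLOCK + audio IDs
scores = torch.randn(1, 10)                # one row of next-token logits

mask = torch.full_like(scores, float("-inf"))
mask[:, allow] = 0
masked = scores + mask                     # -inf everywhere except allowed IDs

assert torch.isinf(masked[0, 0]) and not torch.isinf(masked[0, 5])
```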
@@ -105,7 +122,6 @@ class AudioStreamer(BaseStreamer):
         code_val_6 = block7[6] % CODEBOOK_SIZE
 
         # --- Map the extracted code values to the SNAC codebooks (l1, l2, l3) ---
-        # Using the structure from the user's previous version, believed to be correct
         l1 = [code_val_0]
         l2 = [code_val_1, code_val_4]
         l3 = [code_val_2, code_val_3, code_val_5, code_val_6]
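For reference (not part of the commit), the de-interleave above as a standalone function: per 7-token block, SNAC gets one coarse code (l1), two mid codes (l2), and four fine codes (l3).

```python
# Sketch only: split a 7-value block into SNAC's three codebook levels.
CODEBOOK_SIZE = 4096

def split_block(block7: list[int]) -> tuple[list[int], list[int], list[int]]:
    vals = [v % CODEBOOK_SIZE for v in block7]    # strip per-slot offsets
    l1 = [vals[0]]                                # coarsest level, 1 code/frame
    l2 = [vals[1], vals[4]]                       # middle level, 2 codes/frame
    l3 = [vals[2], vals[3], vals[5], vals[6]]     # finest level, 4 codes/frame
    return l1, l2, l3
```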
@@ -130,15 +146,12 @@ class AudioStreamer(BaseStreamer):
         # --- Decode using SNAC ---
         try:
             with torch.no_grad():
-                # self.snac should already be on self.device from load_models_startup
-                audio = self.snac.decode(codes)[0] # Decode expects list of tensors, result might have batch dim
+                audio = self.snac.decode(codes)[0]
         except Exception as e_decode:
-            # Add more detailed logging here if it fails again
             print(f"Streamer Error: Exception during snac.decode: {e_decode}")
             print(f"Input codes shapes: {[c.shape for c in codes]}")
             print(f"Input codes dtypes: {[c.dtype for c in codes]}")
             print(f"Input codes devices: {[c.device for c in codes]}")
-            # Avoid printing potentially huge lists, maybe just check min/max?
             print(f"Input code values (min/max): L1({min(l1)}/{max(l1)}) L2({min(l2)}/{max(l2)}) L3({min(l3)}/{max(l3)})")
             return b""
 
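The float-to-bytes conversion itself lies outside these hunks. As a hedged sketch of the common approach for SNAC's 24 kHz float output (assuming `audio` is a waveform in [-1, 1]; `to_pcm16` is a hypothetical helper, not necessarily what app.py does):

```python
# Sketch only: convert a float waveform tensor to 16-bit little-endian PCM bytes.
import torch

def to_pcm16(audio: torch.Tensor) -> bytes:
    audio = audio.squeeze().clamp(-1.0, 1.0)        # drop batch/channel dims, limit range
    pcm = (audio * 32767.0).to(torch.int16).cpu()   # scale into int16 range
    return pcm.numpy().tobytes()
```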
@@ -160,7 +173,12 @@ class AudioStreamer(BaseStreamer):
         except WebSocketDisconnect:
             print("Streamer: WebSocket disconnected during send.")
         except Exception as e:
-            print(f"Streamer: Error sending bytes: {e}")
+            # Handle cases where sending fails after connection closed
+            if "Cannot call \"send\" once a close message has been sent" in str(e):
+                # This is expected if client disconnects during generation, suppress repetitive logs
+                pass
+            else:
+                print(f"Streamer: Error sending bytes: {e}")
 
     def put(self, value: torch.LongTensor):
         """
@@ -169,30 +187,34 @@ class AudioStreamer(BaseStreamer):
         """
         if value.numel() == 0:
             return
-        new_token_ids = value.squeeze().tolist()
+        # Ensure value is on CPU and flatten to a list of ints
+        new_token_ids = value.squeeze().cpu().tolist()
         if isinstance(new_token_ids, int):
             new_token_ids = [new_token_ids]
 
         for t in new_token_ids:
-            if t == EOS_TOKEN:
-                break
+            # No need to check for EOS here, StoppingCriteria handles it
             if t == NEW_BLOCK:
                 self.buf.clear()
                 continue
+
             if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
                 self.buf.append(t - AUDIO_BASE) # Store value relative to base
-            # else: # Optionally log ignored tokens
-            #     print(f"Streamer Warning: Ignoring unexpected token {t}")
+            # else: # Optionally log ignored tokens outside audio range
+            #     if t != self.eos_token_id: # Don't warn about the EOS token itself
+            #         print(f"Streamer Warning: Ignoring unexpected token {t}")
 
             if len(self.buf) == 7:
                 audio_bytes = self._decode_block(self.buf)
                 self.buf.clear()
 
                 if audio_bytes:
+                    # Schedule the async send function to run on the main event loop
                     future = asyncio.run_coroutine_threadsafe(self._send_audio_bytes(audio_bytes), self.loop)
                     self.tasks.add(future)
                     future.add_done_callback(self.tasks.discard)
 
+                # Allow EOS only after the first full block has been processed
                 if self.masker.sent_blocks == 0:
                     self.masker.sent_blocks = 1
 
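Background on the run_coroutine_threadsafe call (not part of the commit): put() runs on the worker thread that executes model.generate via asyncio.to_thread, so it cannot await ws.send_bytes directly; it must hand the coroutine to the main event loop. A self-contained sketch of that bridge:

```python
# Sketch only: scheduling a coroutine on the main loop from a worker thread.
import asyncio

async def main() -> None:
    loop = asyncio.get_running_loop()

    async def send(data: bytes) -> None:      # stands in for ws.send_bytes
        print(f"sent {len(data)} bytes")

    def worker() -> None:                     # stands in for the generate thread
        fut = asyncio.run_coroutine_threadsafe(send(b"\x00" * 4096), loop)
        fut.result(timeout=5)                 # block the worker until sent

    await asyncio.to_thread(worker)

asyncio.run(main())
```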
@@ -201,7 +223,6 @@ class AudioStreamer(BaseStreamer):
         if len(self.buf) > 0:
             print(f"Streamer: End of generation with incomplete block ({len(self.buf)} tokens). Discarding.")
             self.buf.clear()
-        # print(f"Streamer: Generation finished. Pending send tasks: {len(self.tasks)}")
         pass
 
 # 5) FastAPI App ------------------------------------------------------
@@ -209,7 +230,7 @@ app = FastAPI()
 
 @app.on_event("startup")
 async def load_models_startup():
-    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU
+    global tok, model, snac, masker, stopping_criteria, device, AUDIO_IDS_CPU, actual_eos_token_id
 
     print(f"🚀 Starting up on device: {device}")
     print("⏳ Lade Modelle …", flush=True)
@@ -218,7 +239,7 @@ async def load_models_startup():
     print("Tokenizer loaded.")
 
     snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)
-    print(f"SNAC loaded to {device}.") # Use the global device variable
+    print(f"SNAC loaded to {device}.")
 
     model_dtype = torch.float32
     if device == "cuda":
@@ -235,25 +256,40 @@ async def load_models_startup():
         torch_dtype=model_dtype,
         low_cpu_mem_usage=True,
     )
-    model.config.pad_token_id = model.config.eos_token_id
     print(f"Model loaded to {model.device} with dtype {model.dtype}.")
     model.eval()
 
+    # --- Determine and set the correct EOS token ID ---
+    conf_eos = model.config.eos_token_id
+    tok_eos = tok.eos_token_id
+    print(f"Model Config EOS ID: {conf_eos}")
+    print(f"Tokenizer EOS ID: {tok_eos}")
+
+    if conf_eos is not None:
+        actual_eos_token_id = conf_eos
+    elif tok_eos is not None:
+        actual_eos_token_id = tok_eos
+        print(f"⚠️ Model config EOS ID is None, using Tokenizer EOS ID: {actual_eos_token_id}")
+    else:
+        raise ValueError("Could not determine EOS token ID from model config or tokenizer.")
+
+    print(f"Using EOS Token ID: {actual_eos_token_id}")
+    # Set pad_token_id to eos_token_id if not already set (common practice for generation)
+    if model.config.pad_token_id is None:
+        print(f"Setting model.config.pad_token_id to EOS token ID ({actual_eos_token_id})")
+        model.config.pad_token_id = actual_eos_token_id
+    # --- End EOS Token ID determination ---
+
     audio_ids_device = AUDIO_IDS_CPU.to(device)
-    masker = AudioMask(audio_ids_device, NEW_BLOCK, EOS_TOKEN)
+    # Pass the determined EOS ID to the mask
+    masker = AudioMask(audio_ids_device, NEW_BLOCK, actual_eos_token_id)
     print("AudioMask initialized.")
 
-    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(EOS_TOKEN)])
+    # Pass the determined EOS ID to the stopping criteria
+    stopping_criteria = StoppingCriteriaList([EosStoppingCriteria(actual_eos_token_id)])
     print("StoppingCriteria initialized.")
 
     print("✅ Modelle geladen und bereit!", flush=True)
-    print(f"Tokenizer EOS ID: {tok.eos_token_id}")
-    print(f"Model Config EOS ID: {model.config.eos_token_id}")
-    print(f"Constant EOS_TOKEN: {EOS_TOKEN}")
-    if tok.eos_token_id != EOS_TOKEN or model.config.eos_token_id != EOS_TOKEN:
-        print("⚠️ WARNING: EOS_TOKEN constant might not match model/tokenizer configuration!")
-        # Consider updating EOS_TOKEN if they differ, e.g.:
-        # EOS_TOKEN = model.config.eos_token_id
 
 @app.get("/")
 def hello():
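The EOS fallback chain introduced above, restated as a standalone helper (illustrative only; `resolve_eos_id` is not a name in app.py):

```python
# Sketch only: model config wins, tokenizer is the fallback, otherwise fail loudly.
from typing import Optional

def resolve_eos_id(config_eos: Optional[int], tokenizer_eos: Optional[int]) -> int:
    if config_eos is not None:
        return config_eos
    if tokenizer_eos is not None:
        return tokenizer_eos
    raise ValueError("No EOS token ID available from model config or tokenizer.")

assert resolve_eos_id(128258, None) == 128258
assert resolve_eos_id(None, 128258) == 128258
```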
@@ -277,6 +313,7 @@ def build_prompt(text: str, voice: str) -> tuple[torch.Tensor, torch.Tensor]:
 # 7) WebSocket‑Endpoint (vereinfacht mit Streamer) ---------------------
 @app.websocket("/ws/tts")
 async def tts(ws: WebSocket):
+    global actual_eos_token_id # Ensure we can access the determined EOS ID
     await ws.accept()
     print("🔌 Client connected")
     streamer = None
@@ -297,24 +334,27 @@ async def tts(ws: WebSocket):
             print(f"Generating audio for: '{text}' with voice '{voice}'")
             ids, attn = build_prompt(text, voice)
             masker.reset()
-            streamer = AudioStreamer(ws, snac, masker, main_loop, device)
+            # Pass the determined EOS ID to the streamer as well (optional, for logging/checks)
+            streamer = AudioStreamer(ws, snac, masker, main_loop, device, actual_eos_token_id)
 
             print("Starting generation in background thread...")
+            # Use sampling parameters to avoid repetition
             await asyncio.to_thread(
-                model.generate,
-                input_ids=ids,
-                attention_mask=attn,
-                max_new_tokens=2500, # Keep or increase later if needed
-                logits_processor=[masker],
-                stopping_criteria=stopping_criteria,
-                # --- Changes ---
-                do_sample=True, # Enable sampling
-                temperature=0.6, # Introduce some randomness (adjust as needed)
-                top_p=0.9, # Focus sampling on more likely tokens (adjust as needed)
-                repetition_penalty=1.15, # Penalize recently generated tokens (adjust > 1.0)
-                # --- End Changes ---
-                use_cache=True,
-                streamer=streamer
+                model.generate,
+                input_ids=ids,
+                attention_mask=attn,
+                max_new_tokens=2500, # Increased slightly, adjust as needed
+                logits_processor=[masker],
+                stopping_criteria=stopping_criteria,
+                # --- Sampling Parameters ---
+                do_sample=True,
+                temperature=0.6,
+                top_p=0.9,
+                repetition_penalty=1.15,
+                # --- End Sampling Parameters ---
+                use_cache=True,
+                streamer=streamer,
+                eos_token_id=actual_eos_token_id # Explicitly pass correct EOS ID here too
             )
             print("Generation thread finished.")
 
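A minimal client sketch for exercising the endpoint (not part of the commit). The request schema is not visible in these hunks, so the JSON payload with `text` and `voice` fields, the voice name, and the port are assumptions:

```python
# Sketch only: stream PCM from /ws/tts and count the received bytes.
import asyncio
import json

import websockets  # pip install websockets

async def main() -> None:
    async with websockets.connect("ws://localhost:7860/ws/tts") as ws:  # port assumed
        await ws.send(json.dumps({"text": "Hallo Welt", "voice": "Jakob"}))  # fields assumed
        pcm = bytearray()
        try:
            async for message in ws:
                if isinstance(message, bytes):
                    pcm.extend(message)        # one decoded SNAC block per message
        except websockets.ConnectionClosed:
            pass
        print(f"received {len(pcm)} bytes of PCM audio")

asyncio.run(main())
```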
@@ -347,7 +387,9 @@ async def tts(ws: WebSocket):
         try:
             await ws.close(code=1000)
         except RuntimeError as e_close:
-            print(f"Runtime error closing websocket: {e_close}")
+            # Suppress "Cannot call 'send'..." error during final close if already disconnected
+            if "Cannot call \"send\"" not in str(e_close):
+                print(f"Runtime error closing websocket: {e_close}")
         except Exception as e_close_final:
             print(f"Error closing websocket: {e_close_final}")
         elif ws.client_state.name != "DISCONNECTED":