Update app.py
app.py (changed):
@@ -122,15 +122,15 @@ async def tts(ws: WebSocket):
         # Update buffer_pos based on current buffer length before generation
         masker.buffer_pos = len(buf)
 
-        # --- Mini‑Generate (Cache …
+        # --- Mini‑Generate (Cache Disabled for Debugging) -------------------------------------------
         gen = model.generate(
-            input_ids …
-            attention_mask …
-            past_key_values …
+            input_ids = ids,            # Always use full sequence
+            attention_mask = attn,      # Always use full attention mask
+            # past_key_values= past,    # Disabled cache
             max_new_tokens = CHUNK_TOKENS,
             logits_processor=[masker],
             do_sample=True, temperature=0.7, top_p=0.95,
-            use_cache= …
+            use_cache=False,            # Disabled cache
             return_dict_in_generate=True,
             return_legacy_cache=True
         )
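This hunk swaps incremental decoding for full-sequence decoding while debugging: with use_cache=False and no past_key_values, every mini-generate call re-feeds the whole token sequence through ids and attn. A minimal sketch of that pattern, reusing the names from the diff (model, device, masker, CHUNK_TOKENS) and assuming seq is the running list of token ids; the prompt setup and loop bound here are placeholders:

import torch

seq = list(prompt_ids)                              # assumed: prompt token ids
for _ in range(num_chunks):                         # assumed chunking loop
    ids = torch.tensor([seq], device=device)        # full sequence, every chunk
    attn = torch.ones_like(ids)                     # full attention mask
    gen = model.generate(
        input_ids=ids,
        attention_mask=attn,
        max_new_tokens=CHUNK_TOKENS,
        logits_processor=[masker],
        do_sample=True, temperature=0.7, top_p=0.95,
        use_cache=False,                            # recompute attention from scratch
        return_dict_in_generate=True,
    )
    new = gen.sequences[0, ids.shape[1]:].tolist()  # tokens added this chunk
    if not new:
        break
    seq.extend(new)

The trade-off is latency: without a cache, each chunk recomputes attention over the entire sequence, so per-chunk cost grows with the total length instead of staying roughly constant.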
@@ -142,10 +142,10 @@ async def tts(ws: WebSocket):
             break
         offset_len += len(new)
 
-        # ----- Update …
-        …
-        …
-        past = gen.past_key_values  # …
+        # ----- Update ids and attn with the full sequence (Cache Disabled) ---------
+        ids = torch.tensor([seq], device=device)   # Re-added
+        attn = torch.ones_like(ids)                # Re-added
+        # past = gen.past_key_values  # Disabled cache access
         last_tok = new[-1]
 
         print("new tokens:", new[:25], flush=True)
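For contrast, the commented-out lines mark where the cached path would be restored: carry gen.past_key_values between calls so generate only has to process the uncached tail (after a completed call the cache typically covers everything up to, but not including, the last sampled token, which is why the diff keeps last_tok = new[-1]). A sketch under the same assumptions; exact cache handling varies across transformers versions:

import torch

past = None                                         # no cache on the first call
for _ in range(num_chunks):                         # assumed chunking loop
    ids = torch.tensor([seq], device=device)        # full sequence; generate()
    attn = torch.ones_like(ids)                     # skips the cached prefix
    gen = model.generate(
        input_ids=ids,
        attention_mask=attn,
        past_key_values=past,                       # reuse cached attention state
        max_new_tokens=CHUNK_TOKENS,
        logits_processor=[masker],
        do_sample=True, temperature=0.7, top_p=0.95,
        use_cache=True,
        return_dict_in_generate=True,
        return_legacy_cache=True,
    )
    new = gen.sequences[0, ids.shape[1]:].tolist()  # tokens added this chunk
    if not new:
        break
    seq.extend(new)
    past = gen.past_key_values                      # carry the cache forward
    last_tok = new[-1]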