# app.py ──────────────────────────────────────────────────────────────
import os, json, torch
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, LogitsProcessor
from snac import SNAC
# 0) Login + Device ---------------------------------------------------
HF_TOKEN = os.getenv("HF_TOKEN")
if HF_TOKEN:
    login(HF_TOKEN)
device = "cuda" if torch.cuda.is_available() else "cpu"
torch.backends.cuda.enable_flash_sdp(False)  # work around a PyTorch 2.2 SDPA bug
# 1) Constants ---------------------------------------------------------
REPO = "SebastianBodza/Kartoffel_Orpheus-3B_german_natural-v0.1"
CHUNK_TOKENS = 50
START_TOKEN = 128259
NEW_BLOCK = 128257
EOS_TOKEN = 128258
AUDIO_BASE = 128266
AUDIO_SPAN = 4096 * 7 # 28,672 codes: 7 positions x 4096 codes each
AUDIO_IDS = torch.arange(AUDIO_BASE, AUDIO_BASE + AUDIO_SPAN) # all valid audio token ids
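# Token layout (as implied by the mask and decoder below): the model emits
# audio in blocks of 7 tokens. The token at block position p (0..6) lives in
# its own 4096-wide band starting at AUDIO_BASE + p * 4096, so a token's
# position inside the block can be recovered from its id alone.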
# 2) Logit mask (NEW_BLOCK + audio; EOS only after the first block) ----
class AudioMask(LogitsProcessor):
    def __init__(self, audio_ids: torch.Tensor):
        super().__init__()
        self.allow = torch.cat([
            torch.tensor([NEW_BLOCK], device=audio_ids.device),
            audio_ids
        ])
        self.eos = torch.tensor([EOS_TOKEN], device=audio_ids.device)
        self.sent_blocks = 0  # EOS is only allowed once a full block was sent
        self.buffer_pos = 0   # position (0..6) inside the current 7-token block

    def reset(self):
        # Clear per-request state; a single masker instance is shared globally.
        self.sent_blocks = 0
        self.buffer_pos = 0
    def __call__(self, input_ids, logits):
        # Update the block position from the most recently sampled token, so
        # the mask stays correct for every step inside generate(), not just
        # for the first step of each chunk.
        last = input_ids[0, -1].item()
        if last == NEW_BLOCK:
            self.buffer_pos = 0
        elif AUDIO_BASE <= last < AUDIO_BASE + AUDIO_SPAN:
            self.buffer_pos += 1
        if self.buffer_pos >= 7:
            # Block complete: the model must close it with NEW_BLOCK.
            allowed = torch.tensor([NEW_BLOCK], device=logits.device)
        else:
            # Inside a block: only the 4096-code band of the current position.
            start_token = AUDIO_BASE + self.buffer_pos * 4096
            allowed = torch.arange(start_token, start_token + 4096,
                                   device=logits.device)
        if self.sent_blocks:  # allow EOS once the first block went out
            allowed = torch.cat([allowed, self.eos])
        mask = logits.new_full(logits.shape, float("-inf"))
        mask[:, allowed] = 0
        return logits + mask
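# Note: generate() invokes each LogitsProcessor once per decoding step with
# the full token prefix and the next-token logits, which is what lets the
# mask above re-derive its block position from input_ids at every step.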
# 3) FastAPI scaffolding -----------------------------------------------
app = FastAPI()

@app.get("/")
def hello():
    return {"status": "ok"}
@app.on_event("startup")
def load_models():
    global tok, model, snac, masker
    print("⏳ Loading models …", flush=True)
    tok = AutoTokenizer.from_pretrained(REPO)
    snac = SNAC.from_pretrained("hubertsiuzdak/snac_24khz").to(device)
    model = AutoModelForCausalLM.from_pretrained(
        REPO,
        device_map={"": 0} if device == "cuda" else None,
        torch_dtype=torch.bfloat16 if device == "cuda" else None,
        low_cpu_mem_usage=True,
    )
    model.config.pad_token_id = model.config.eos_token_id
    masker = AudioMask(AUDIO_IDS.to(device))
    print("✅ Models loaded", flush=True)
# 4) Helper -----------------------------------------------------------
def build_prompt(text: str, voice: str):
    prompt_ids = tok(f"{voice}: {text}", return_tensors="pt").input_ids.to(device)
    ids = torch.cat([torch.tensor([[START_TOKEN]], device=device),
                     prompt_ids,
                     torch.tensor([[128009, 128260]], device=device)], 1)
    attn = torch.ones_like(ids)
    return ids, attn
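# The assembled prompt is:
#   <128259> tokenize("{voice}: {text}") <128009> <128260>
# The ids 128009 and 128260 are taken verbatim from the code above; they
# presumably mark end-of-text and start-of-audio in the Orpheus convention.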
def decode_block(block7: list[int]) -> bytes:
    # block7 holds 7 codes already rebased to AUDIO_BASE by the caller;
    # strip each position's 4096 offset and split into SNAC's three codebooks.
    l1 = [block7[0] - 0 * 4096]                        # coarse: 1 code/block
    l2 = [block7[1] - 1 * 4096, block7[4] - 4 * 4096]  # mid:    2 codes/block
    l3 = [block7[2] - 2 * 4096, block7[3] - 3 * 4096,  # fine:   4 codes/block
          block7[5] - 5 * 4096, block7[6] - 6 * 4096]
    with torch.no_grad():
        codes = [torch.tensor(x, device=device).unsqueeze(0)
                 for x in (l1, l2, l3)]
        audio = snac.decode(codes).squeeze().detach().cpu().numpy()
    return (audio * 32767).astype("int16").tobytes()
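# SNAC is a hierarchical codec: the three lists above feed its three
# quantizer levels, and snac.decode() returns a float waveform (24 kHz mono
# for the snac_24khz checkpoint) that is scaled to 16-bit PCM here.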
# 5) WebSocket endpoint ------------------------------------------------
@app.websocket("/ws/tts")
async def tts(ws: WebSocket):
    await ws.accept()
    try:
        req = json.loads(await ws.receive_text())
        text = req.get("text", "")
        voice = req.get("voice", "Jakob")
        ids, attn = build_prompt(text, voice)
        past = None
        offset_len = ids.size(1)  # number of tokens that already exist
        last_tok = None
        buf = []
        masker.reset()  # clear per-request state of the shared masker
        while True:
            # --- Mini-generate (KV cache disabled for debugging) ----------
            # The masker tracks its block position itself, so no manual
            # buffer_pos sync is needed before each chunk.
            gen = model.generate(
                input_ids = ids,               # always the full sequence
                attention_mask = attn,
                # past_key_values = past,      # cache disabled
                max_new_tokens = CHUNK_TOKENS,
                logits_processor=[masker],
                do_sample=True, temperature=0.7, top_p=0.95,
                use_cache=False,               # cache disabled
                return_dict_in_generate=True,
                return_legacy_cache=True
            )
            # ----- slice out the newly generated tokens --------------------
            seq = gen.sequences[0].tolist()
            new = seq[offset_len:]
            if not new:                        # nothing new -> done
                break
            offset_len += len(new)
            # ----- feed the full sequence back in (cache disabled) ---------
            ids = torch.tensor([seq], device=device)
            attn = torch.ones_like(ids)
            # past = gen.past_key_values      # cache disabled
            last_tok = new[-1]
            print("new tokens:", new[:25], flush=True)
            # ----- token handling -------------------------------------------
            for t in new:
                if t == EOS_TOKEN:
                    raise StopIteration        # caught below, ends the stream
                if t == NEW_BLOCK:
                    buf.clear()
                    continue
                if AUDIO_BASE <= t < AUDIO_BASE + AUDIO_SPAN:
                    buf.append(t - AUDIO_BASE) # store relative to AUDIO_BASE
                    if len(buf) == 7:
                        await ws.send_bytes(decode_block(buf))
                        buf.clear()
                        masker.sent_blocks = 1 # EOS allowed from now on
                else:
                    print(f"DEBUG: skipping non-audio token: {t}", flush=True)
    except (StopIteration, WebSocketDisconnect):
        pass
    except Exception as e:
        print("❌ WS error:", e, flush=True)
        import traceback
        traceback.print_exc()
        if ws.client_state.name != "DISCONNECTED":
            await ws.close(code=1011)
    finally:
        if ws.client_state.name != "DISCONNECTED":
            try:
                await ws.close()
            except RuntimeError:
                pass
# 6) Dev start ---------------------------------------------------------
if __name__ == "__main__":
    import uvicorn
    uvicorn.run("app:app", host="0.0.0.0", port=7860, log_level="info")
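# Minimal test client (a sketch; assumes the server runs locally and the
# third-party `websockets` package is installed -- it is not used by this app):
#
#   import asyncio, json, websockets
#
#   async def main():
#       async with websockets.connect("ws://localhost:7860/ws/tts") as ws:
#           await ws.send(json.dumps({"text": "Hallo Welt", "voice": "Jakob"}))
#           pcm = b""
#           try:
#               while True:
#                   pcm += await ws.recv()   # raw mono int16 PCM chunks
#           except websockets.exceptions.ConnectionClosed:
#               pass
#       open("out.pcm", "wb").write(pcm)
#
#   asyncio.run(main())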