# dev-mode-python / app.py
# Snapshot "upd30" (commit a9f2d93), 2.44 kB — exported from the hosting UI.
import asyncio
import base64
import io
import json

from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
from fastapi.responses import StreamingResponse, JSONResponse
import outetts
# Initialize the interface
interface = outetts.Interface(
config=outetts.ModelConfig.auto_config(
model=outetts.Models.VERSION_1_0_SIZE_1B,
# For llama.cpp backend
#backend=outetts.Backend.LLAMACPP,
#quantization=outetts.LlamaCppQuantization.FP16
# For transformers backend
backend=outetts.Backend.HF,
)
)
# Load the default speaker profile
speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")
app = FastAPI()
@app.get("/")
def greet_json():
return {"Hello": "World!"}
@app.websocket("/ws/tts")
async def websocket_tts(websocket: WebSocket):
    """Stream synthesized speech for each text message received on the socket.

    Protocol (every frame is a JSON text message):
      client -> server: raw text to synthesize
      server -> client: {"generation_status": ...} progress events,
                        {"data": {"audio_bytes": <base64>, "duration": null,
                        "request_finished": false}} audio chunks, then one
                        final frame with "request_finished": true.
    On any unexpected error a {"error": <message>} frame is attempted.
    """
    await websocket.accept()
    try:
        while True:
            # Receive one text chunk from the client.
            text = await websocket.receive_text()

            # Status: warming up.
            await websocket.send_text(
                json.dumps({"generation_status": "Warming up TTS model"})
            )

            # interface.generate is a synchronous, CPU/GPU-heavy call; run it
            # in a worker thread so the event loop (and any other websocket
            # clients) stay responsive while synthesis runs.
            output = await asyncio.to_thread(
                interface.generate,
                config=outetts.GenerationConfig(
                    text=text,
                    generation_type=outetts.GenerationType.CHUNKED,
                    speaker=speaker,
                    sampler_config=outetts.SamplerConfig(
                        temperature=0.4
                    ),
                ),
            )

            # Status: generating linguistic features.
            await websocket.send_text(
                json.dumps({"generation_status": "Generating linguistic features"})
            )

            # Stream base64-encoded audio chunks to the client.
            # NOTE(review): assumes output.stream yields bytes-like chunks —
            # confirm against the outetts API.
            for chunk in output.stream(chunk_size=4096):
                audio_b64 = base64.b64encode(chunk).decode("ascii")
                await websocket.send_text(json.dumps({
                    "data": {
                        "audio_bytes": audio_b64,
                        "duration": None,
                        "request_finished": False
                    }
                }))

            # Final event: empty payload with request_finished=True marks the
            # end of this request's audio stream.
            await websocket.send_text(json.dumps({
                "data": {
                    "audio_bytes": "",
                    "duration": None,
                    "request_finished": True
                }
            }))
    except WebSocketDisconnect:
        # Client went away; nothing to clean up.
        pass
    except Exception as e:
        # Best-effort error report: the socket may already be closed, in
        # which case send_text would itself raise and mask the real error.
        try:
            await websocket.send_text(json.dumps({"error": str(e)}))
        except Exception:
            pass