# Hugging Face Space: OuteTTS streaming text-to-speech service (FastAPI).
# (The lines "Spaces: / Paused / Paused" were UI residue from the Space's
# status page, not part of the program.)
# --- Dependencies -------------------------------------------------------
# Standard library
import base64
import io
import json

# Third-party
import outetts
from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
from fastapi.responses import JSONResponse, StreamingResponse

# --- Model initialization (runs once, at import time) -------------------
# Build the OuteTTS interface around the 1B v1.0 model using the HF
# Transformers backend. The llama.cpp alternative is kept for reference.
interface = outetts.Interface(
    config=outetts.ModelConfig.auto_config(
        model=outetts.Models.VERSION_1_0_SIZE_1B,
        # For llama.cpp backend:
        # backend=outetts.Backend.LLAMACPP,
        # quantization=outetts.LlamaCppQuantization.FP16,
        # For transformers backend:
        backend=outetts.Backend.HF,
    )
)

# Default voice profile used for every generation request.
speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")

app = FastAPI()
@app.get("/")
def greet_json():
    """Health-check / landing route returning a static JSON greeting.

    NOTE(review): the original file defined this handler without any route
    decorator, so it was never registered with the app; ``@app.get("/")``
    matches the standard Hugging Face Spaces FastAPI template — confirm
    the intended path.
    """
    return {"Hello": "World!"}
@app.websocket("/ws")
async def websocket_tts(websocket: WebSocket):
    """Stream synthesized speech for each text message on the websocket.

    Protocol — every event is one JSON text frame:
      * ``{"generation_status": <str>}``              progress update
      * ``{"data": {"audio_bytes": <base64 str>, "duration": null,
        "request_finished": false}}``                 one audio chunk
      * a final ``data`` frame with ``"audio_bytes": ""`` and
        ``"request_finished": true``                  end of this request
      * ``{"error": <str>}``                          on failure

    NOTE(review): the original handler carried no ``@app.websocket``
    decorator and was therefore never routed; ``"/ws"`` is an assumed
    path — confirm against the client code.
    """
    await websocket.accept()
    try:
        while True:
            # Receive one text chunk from the client; each message
            # triggers a full generation/streaming cycle.
            data = await websocket.receive_text()

            # Status: warming up.
            await websocket.send_text(
                json.dumps({"generation_status": "Warming up TTS model"})
            )

            output = interface.generate(
                config=outetts.GenerationConfig(
                    text=data,
                    generation_type=outetts.GenerationType.CHUNKED,
                    speaker=speaker,
                    sampler_config=outetts.SamplerConfig(
                        temperature=0.4
                    ),
                )
            )

            # Status: generating.
            await websocket.send_text(
                json.dumps({"generation_status": "Generating linguistic features"})
            )

            # Stream base64-encoded audio chunks to the client.
            # NOTE(review): assumes output.stream() yields bytes-like
            # chunks — confirm against the outetts API.
            for chunk in output.stream(chunk_size=4096):
                audio_b64 = base64.b64encode(chunk).decode("ascii")
                await websocket.send_text(json.dumps({
                    "data": {
                        "audio_bytes": audio_b64,
                        "duration": None,
                        "request_finished": False
                    }
                }))

            # Final event marking the end of this request's audio.
            await websocket.send_text(json.dumps({
                "data": {
                    "audio_bytes": "",
                    "duration": None,
                    "request_finished": True
                }
            }))
    except WebSocketDisconnect:
        # Client went away; nothing to clean up.
        pass
    except Exception as e:
        # Best-effort error report. The socket may already be unusable,
        # in which case this send would itself raise and mask the real
        # error — so guard it.
        try:
            await websocket.send_text(json.dumps({"error": str(e)}))
        except Exception:
            pass