"""FastAPI service exposing OuteTTS text-to-speech over a WebSocket.

Clients send raw text frames to ``/ws/tts`` and receive JSON frames in
return: status updates (``generation_status``), base64-encoded audio
chunks (``data.audio_bytes``), and a final frame whose
``data.request_finished`` is ``true``.
"""

import asyncio
import base64
import io
import json

import outetts
from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
from fastapi.responses import StreamingResponse, JSONResponse

# Initialize the TTS interface once at import time (model load is expensive).
interface = outetts.Interface(
    config=outetts.ModelConfig.auto_config(
        model=outetts.Models.VERSION_1_0_SIZE_1B,
        # For llama.cpp backend:
        # backend=outetts.Backend.LLAMACPP,
        # quantization=outetts.LlamaCppQuantization.FP16
        # For transformers backend:
        backend=outetts.Backend.HF,
    )
)

# Load the default speaker profile.
speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")

app = FastAPI()

# Size (in raw bytes, pre-base64) of each audio chunk streamed to the client.
CHUNK_SIZE = 4096


@app.get("/")
def greet_json():
    """Trivial health-check endpoint."""
    return {"Hello": "World!"}


def _synthesize(text: str) -> bytes:
    """Run the blocking TTS generation for *text* and return the audio bytes.

    Executed in a worker thread (see ``websocket_tts``) because model
    inference is CPU/GPU-bound and must not block the event loop.
    """
    output = interface.generate(
        config=outetts.GenerationConfig(
            text=text,
            generation_type=outetts.GenerationType.CHUNKED,
            speaker=speaker,
            sampler_config=outetts.SamplerConfig(temperature=0.4),
        )
    )
    buffer = io.BytesIO()
    output.save(buffer)
    return buffer.getvalue()


@app.websocket("/ws/tts")
async def websocket_tts(websocket: WebSocket):
    """Stream synthesized speech for each text frame received from the client.

    Protocol per request: one "Warming up" status frame, synthesis, one
    "Generating linguistic features" status frame, N audio-chunk frames,
    then one terminal frame with ``request_finished: true``.
    """
    await websocket.accept()
    try:
        while True:
            # Receive a text chunk from the client.
            data = await websocket.receive_text()

            # Status: warming up.
            await websocket.send_text(json.dumps({"generation_status": "Warming up TTS model"}))

            # BUG FIX: the model call is blocking; run it in a worker thread
            # so the event loop (and other connections) stay responsive.
            audio_bytes = await asyncio.to_thread(_synthesize, data)

            # Status: generating linguistic features.
            await websocket.send_text(json.dumps({"generation_status": "Generating linguistic features"}))

            # Stream the audio back in fixed-size base64-encoded chunks.
            for start in range(0, len(audio_bytes), CHUNK_SIZE):
                chunk = audio_bytes[start:start + CHUNK_SIZE]
                await websocket.send_text(json.dumps({
                    "data": {
                        "audio_bytes": base64.b64encode(chunk).decode("ascii"),
                        "duration": None,
                        "request_finished": False,
                    }
                }))

            # Final event: signal the end of this request's audio stream.
            await websocket.send_text(json.dumps({
                "data": {
                    "audio_bytes": "",
                    "duration": None,
                    "request_finished": True,
                }
            }))
    except WebSocketDisconnect:
        # Client went away; nothing to clean up.
        pass
    except Exception as e:
        # BUG FIX: the socket may already be closed; an unguarded send here
        # would raise again and mask the original error. Best-effort only.
        try:
            await websocket.send_text(json.dumps({"error": str(e)}))
        except Exception:
            pass