"""FastAPI service exposing an OuteTTS text-to-speech endpoint."""

import io
import json
import logging
import threading
from typing import Iterator

import outetts
from fastapi import FastAPI, Request
from fastapi.concurrency import run_in_threadpool
from fastapi.responses import JSONResponse, StreamingResponse

logger = logging.getLogger(__name__)

# Size of each chunk handed to the streaming response.
_STREAM_CHUNK_SIZE = 64 * 1024

# Initialize the interface
interface = outetts.Interface(
    config=outetts.ModelConfig.auto_config(
        model=outetts.Models.VERSION_1_0_SIZE_1B,
        # For llama.cpp backend
        # backend=outetts.Backend.LLAMACPP,
        # quantization=outetts.LlamaCppQuantization.FP16
        # For transformers backend
        backend=outetts.Backend.HF,
    )
)

# Load the default speaker profile
speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")

# Generation now runs in worker threads (see tts_endpoint). The lock keeps
# the one-request-at-a-time behaviour the blocking handler used to have, so
# the shared model is never driven by two threads at once.
_generation_lock = threading.Lock()

app = FastAPI()


def _synthesize_wav(text: str) -> io.BytesIO:
    """Generate speech for *text* and return it in a rewound in-memory buffer.

    This is a blocking call; run it outside the event loop.
    """
    with _generation_lock:
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=text,
                generation_type=outetts.GenerationType.CHUNKED,
                speaker=speaker,
                sampler_config=outetts.SamplerConfig(
                    temperature=0.4
                ),
            )
        )

    audio_buffer = io.BytesIO()
    # NOTE(review): assumes output.save accepts a file-like object as well as
    # a path -- confirm against the installed outetts version.
    output.save(audio_buffer)
    audio_buffer.seek(0)
    return audio_buffer


def _iter_chunks(
    buffer: io.BytesIO, chunk_size: int = _STREAM_CHUNK_SIZE
) -> Iterator[bytes]:
    """Yield the remaining contents of *buffer* in pieces of *chunk_size* bytes."""
    return iter(lambda: buffer.read(chunk_size), b"")


@app.get("/")
def greet_json():
    """Return a static greeting payload."""
    return {"Hello": "World!"}


@app.post("/tts")
async def tts_endpoint(request: Request):
    """
    Accepts JSON {"text": "..."} and streams the generated audio with media
    type audio/wav.

    Responds with 400 and an {"error": ...} body when the request body is not
    valid JSON, is not a JSON object, or has a missing, empty or non-string
    "text". Responds with 500 and an {"error": ...} body when generation fails.
    """
    # Malformed JSON is a client error, not a server error.
    try:
        data = await request.json()
    except ValueError:  # json.JSONDecodeError and UnicodeDecodeError
        return JSONResponse({"error": "Invalid JSON body"}, status_code=400)

    # A JSON array/string/number has no .get(); reject it explicitly.
    if not isinstance(data, dict):
        return JSONResponse(
            {"error": "Request body must be a JSON object"}, status_code=400
        )

    text = data.get("text")
    if not text:
        return JSONResponse({"error": "Missing 'text' in request"}, status_code=400)
    if not isinstance(text, str):
        return JSONResponse({"error": "'text' must be a string"}, status_code=400)

    # Generate audio from text in a worker thread so the event loop stays
    # free to serve other requests while the model runs.
    try:
        audio_buffer = await run_in_threadpool(_synthesize_wav, text)
    except Exception as exc:
        logger.exception("TTS generation failed")
        # NOTE(review): the exception text is echoed to the client, as before;
        # consider a generic message if internal details must not leak.
        return JSONResponse({"error": str(exc)}, status_code=500)

    return StreamingResponse(_iter_chunks(audio_buffer), media_type="audio/wav")


# WebSocket endpoint removed; use POST /tts for TTS requests.