File size: 4,584 Bytes
60a351e
ac6dc4e
ad94d02
323d1a0
ac6dc4e
a9f2d93
1c9d2ef
9d43218
ad94d02
 
 
 
14c5384
a9f2d93
 
14c5384
 
ad94d02
 
 
 
e93f063
ad94d02
 
 
 
 
 
323d1a0
69f55df
 
 
 
 
a9f2d93
69f55df
a9f2d93
 
 
 
 
 
 
 
 
 
 
 
 
 
c060e06
9d43218
 
 
c060e06
9d43218
 
1c9d2ef
 
 
 
 
 
 
 
 
 
d0fc554
1c9d2ef
d0fc554
 
1c9d2ef
 
 
 
 
 
 
 
 
 
 
 
 
 
9d43218
 
 
 
 
 
 
 
 
 
1c9d2ef
9d43218
 
 
 
 
a9f2d93
 
 
 
 
 
 
 
69f55df
a9f2d93
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
import asyncio
import base64
import io
import json
import os
import struct
import tempfile
import threading

import outetts
from fastapi import FastAPI, Request, WebSocket, WebSocketDisconnect
from fastapi.responses import StreamingResponse, JSONResponse
# Build the TTS interface once at import time; the WebSocket handler below
# reuses it for every request.
# The Hugging Face transformers backend is selected here. To use the llama.cpp
# backend instead, pass backend=outetts.Backend.LLAMACPP together with
# quantization=outetts.LlamaCppQuantization.FP16.
_model_config = outetts.ModelConfig.auto_config(
    model=outetts.Models.VERSION_1_0_SIZE_1B,
    backend=outetts.Backend.HF,
)
interface = outetts.Interface(config=_model_config)

# Default speaker profile passed to each generation call.
speaker = interface.load_default_speaker("EN-FEMALE-1-NEUTRAL")

# ASGI application object that the route decorators below register against.
app = FastAPI()

@app.get("/")
def greet_json():
    """Return the static greeting payload for the root route."""
    greeting = {"Hello": "World!"}
    return greeting

# Number of raw PCM bytes carried by each audio event sent to the client.
_PCM_CHUNK_SIZE = 4096

# Generation used to run inline on the event loop, i.e. strictly one request
# at a time. Now that it runs in worker threads, this lock keeps the shared
# model from being driven by two requests concurrently.
_GENERATION_LOCK = threading.Lock()


def _generate_speech(text):
    """Run the TTS model on *text* and return the outetts output object.

    This call blocks for the whole generation, so it is meant to be executed
    in a worker thread rather than directly on the event loop.
    """
    with _GENERATION_LOCK:
        return interface.generate(
            config=outetts.GenerationConfig(
                text=text,
                generation_type=outetts.GenerationType.CHUNKED,
                speaker=speaker,
                sampler_config=outetts.SamplerConfig(
                    temperature=0.4
                ),
            )
        )


def _render_wav_bytes(output):
    """Save *output* as a WAV file and return the file's complete contents.

    The file lives in a private temporary directory that is removed on exit,
    even when saving or reading fails, so nothing is left behind on disk.
    """
    with tempfile.TemporaryDirectory() as scratch_dir:
        wav_path = os.path.join(scratch_dir, "speech.wav")
        output.save(wav_path)
        with open(wav_path, "rb") as wav_file:
            return wav_file.read()


def _split_wav(wav_bytes):
    """Split a RIFF/WAVE byte string into ``(header, pcm)``.

    ``header`` is everything up to and including the ``data`` chunk's 8-byte
    id/size prefix, with the size field overwritten by ``0xFFFFFFFF`` to mark
    the length as unknown for streaming. ``pcm`` is every byte that follows.

    The chunk list is walked properly instead of searching for the byte
    sequence ``b"data"``, which could also occur inside another chunk's
    payload.

    Raises:
        ValueError: if the RIFF/WAVE signature or the ``data`` chunk is missing.
    """
    if wav_bytes[:4] != b"RIFF" or wav_bytes[8:12] != b"WAVE":
        raise ValueError("Not a valid WAV file")

    offset = 12  # first sub-chunk starts right after "RIFF" <size> "WAVE"
    while offset + 8 <= len(wav_bytes):
        chunk_id, chunk_len = struct.unpack_from("<4sI", wav_bytes, offset)
        if chunk_id == b"data":
            header_end = offset + 8
            header = bytearray(wav_bytes[:header_end])
            struct.pack_into("<I", header, offset + 4, 0xFFFFFFFF)
            return bytes(header), wav_bytes[header_end:]
        # RIFF chunks are word-aligned: odd-sized payloads carry one pad byte.
        offset += 8 + chunk_len + (chunk_len & 1)

    raise ValueError("No 'data' chunk found in WAV file")


async def _send_audio_event(websocket, audio, finished=False):
    """Send one audio event carrying *audio* base64-encoded as JSON text."""
    await websocket.send_text(json.dumps({
        "data": {
            "audio_bytes": base64.b64encode(audio).decode("ascii"),
            "duration": None,
            "request_finished": finished
        }
    }))


@app.websocket("/ws/tts")
async def websocket_tts(websocket: WebSocket):
    """Serve text-to-speech over a WebSocket.

    For every text message received, the handler sends, in order:

    1. ``{"generation_status": "Warming up TTS model"}``
    2. ``{"generation_status": "Generating linguistic features"}``
    3. one or more ``{"data": {...}}`` events whose ``audio_bytes`` field is
       base64 audio; the first carries the WAV header plus the first PCM
       chunk, the rest carry raw PCM only
    4. a final ``{"data": {...}}`` event with empty ``audio_bytes`` and
       ``request_finished`` set to true

    The loop then waits for the next text message. On any unexpected error a
    single ``{"error": "<message>"}`` message is attempted and the handler
    returns.
    """
    await websocket.accept()
    loop = asyncio.get_running_loop()
    try:
        while True:
            # Receive the next chunk of text from the client.
            text = await websocket.receive_text()

            await websocket.send_text(json.dumps({"generation_status": "Warming up TTS model"}))
            # Model inference is blocking; run it off the event loop so other
            # connections stay responsive.
            output = await loop.run_in_executor(None, _generate_speech, text)

            await websocket.send_text(json.dumps({"generation_status": "Generating linguistic features"}))
            wav_bytes = await loop.run_in_executor(None, _render_wav_bytes, output)

            header, pcm = _split_wav(wav_bytes)

            # The first event carries the header so the client can start
            # decoding; it is sent even when there is no PCM payload at all.
            await _send_audio_event(websocket, header + pcm[:_PCM_CHUNK_SIZE])
            for start in range(_PCM_CHUNK_SIZE, len(pcm), _PCM_CHUNK_SIZE):
                await _send_audio_event(websocket, pcm[start:start + _PCM_CHUNK_SIZE])

            # Final event: no audio, marks the request as complete.
            await _send_audio_event(websocket, b"", finished=True)
    except WebSocketDisconnect:
        # Client went away; there is nobody left to notify.
        pass
    except Exception as e:
        try:
            await websocket.send_text(json.dumps({"error": str(e)}))
        except (WebSocketDisconnect, RuntimeError):
            # The connection is already closed, so the error cannot be
            # delivered; avoid raising a second exception from the handler.
            pass