Spaces: Running on Zero

Harry Coultas Blum committed · Commit 88afac1 · 1 Parent(s): a2e6acf

INIT
- app.py +385 -0
- inference.py +12 -0
- requirements.txt +17 -0
- src/vui/__init__.py +1 -0
- src/vui/config.py +41 -0
- src/vui/fluac.py +707 -0
- src/vui/inference.py +405 -0
- src/vui/model.py +445 -0
- src/vui/notebook.py +41 -0
- src/vui/patterns.py +423 -0
- src/vui/rope.py +54 -0
- src/vui/sampling.py +43 -0
- src/vui/tok.py +19 -0
- src/vui/utils.py +422 -0
- src/vui/vad.py +363 -0
app.py
ADDED
@@ -0,0 +1,385 @@
import time

import gradio as gr
import torch

from vui.inference import render
from vui.model import Vui


def get_available_models():
    """Extract all CAPs static variables from Vui class that end with .pt"""
    models = {}
    for attr_name in dir(Vui):
        if attr_name.isupper():
            attr_value = getattr(Vui, attr_name)
            if isinstance(attr_value, str) and attr_value.endswith(".pt"):
                models[attr_name] = attr_value
    return models


AVAILABLE_MODELS = get_available_models()
print(f"Available models: {list(AVAILABLE_MODELS.keys())}")

current_model = None
current_model_name = None


def load_and_warm_model(model_name):
    """Load and warm up a specific model"""
    global current_model, current_model_name

    if current_model_name == model_name and current_model is not None:
        print(f"Model {model_name} already loaded and warmed up!")
        return current_model

    print(f"Loading model {model_name}...")
    model_path = AVAILABLE_MODELS[model_name]
    model = Vui.from_pretrained_inf(model_path).cuda()

    print(f"Compiling model {model_name}...")
    model.decoder = torch.compile(model.decoder, fullgraph=True)

    print(f"Warming up model {model_name}...")
    warmup_text = "Hello, this is a test. Let's say some random shizz"
    render(
        model,
        warmup_text,
        max_secs=10,
    )

    current_model = model
    current_model_name = model_name
    print(f"Model {model_name} loaded and warmed up successfully!")
    return model


# Load default model (COHOST)
default_model = (
    "COHOST" if "COHOST" in AVAILABLE_MODELS else list(AVAILABLE_MODELS.keys())[0]
)
model = load_and_warm_model(default_model)

# Preload sample 1 (index 0) with current model
print("Preloading sample 1...")
sample_1_text = """Welcome to Fluxions, the podcast where... we uh explore how technology is shaping the world around us. I'm your host, Alex.
[breath] And I'm Jamie um [laugh] today, we're diving into a [hesitate] topic that's transforming customer service uh voice technology for agents.
That's right. We're [hesitate] talking about the AI-driven tools that are making those long, frustrating customer service calls a little more bearable, for both the customer and the agents."""

sample_1_audio = render(
    current_model,
    sample_1_text,
)
sample_1_audio = sample_1_audio.cpu()
sample_1_audio = sample_1_audio[..., :-2000]  # Trim end artifacts
preloaded_sample_1 = (model.codec.config.sample_rate, sample_1_audio.flatten().numpy())
print("Sample 1 preloaded successfully!")
print("Models ready for inference!")

# Sample texts for quick testing - keeping original examples intact
SAMPLE_TEXTS = [
    """Welcome to Fluxions, the podcast where... we uh explore how technology is shaping the world around us. I'm your host, Alex.
[breath] And I'm Jamie um [laugh] today, we're diving into a [hesitate] topic that's transforming customer service uh voice technology for agents.
That's right. We're [hesitate] talking about the AI-driven tools that are making those long, frustrating customer service calls a little more bearable, for both the customer and the agents.""",
    """Um, hey Sarah, so I just left the meeting with the, uh, rabbit focus group and they are absolutely loving the new heritage carrots! Like, I've never seen such enthusiastic thumping in my life! The purple ones are testing through the roof - apparently the flavor profile is just amazing - and they're willing to pay a premium for them! We need to, like, triple production on those immediately and maybe consider a subscription model? Anyway, gotta go, but let's touch base tomorrow about scaling this before the Easter rush hits!""",
    """What an absolute joke, like I'm really not enjoying this situation where I'm just forced to say things.""",
    """ So [breath] I don't know if you've been there [breath] but I'm really pissed off.
Oh no! Why, what happened?
Well I went to this cafe hearth, and they gave me the worst toastie I've ever had, it didn't come with salad it was just raw.
Well that's awful what kind of toastie was it?
It was supposed to be a chicken bacon lettuce tomatoe, but it was fucking shite, like really bad and I honestly would have preferred to eat my own shit.
[laugh] well, it must have been awful for you, I'm sorry to hear that, why don't we move on to brighter topics, like the good old weather?""",
]


def text_to_speech(text, temperature=0.5, top_k=100, top_p=None, max_duration=60):
    """
    Convert text to speech using the current Vui model

    Args:
        text (str): Input text to convert to speech
        temperature (float): Sampling temperature (0.1-1.0)
        top_k (int): Top-k sampling parameter
        top_p (float): Top-p sampling parameter (None to disable)
        max_duration (int): Maximum audio duration in seconds

    Returns:
        tuple: (sample_rate, audio_array) for Gradio audio output
    """
    if not text.strip():
        return None, "Please enter some text to convert to speech."

    if current_model is None:
        return None, "No model loaded. Please select a model first."

    print(f"Generating speech for: {text[:50]}... using model {current_model_name}")

    # Generate speech using render
    t1 = time.perf_counter()
    result = render(
        current_model,
        text.strip(),
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        max_secs=max_duration,
    )

    # Long text: render returns (codes, text, audio) tuple
    waveform = result

    # waveform is already decoded audio from generate_infinite
    waveform = waveform.cpu()
    sr = current_model.codec.config.sample_rate

    # Calculate generation speed
    generation_time = time.perf_counter() - t1
    audio_duration = waveform.shape[-1] / sr
    speed_factor = audio_duration / generation_time

    # Trim end artifacts if needed
    if waveform.shape[-1] > 2000:
        waveform = waveform[..., :-2000]

    # Convert to numpy array for Gradio
    audio_array = waveform.flatten().numpy()

    info = f"Generated {audio_duration:.1f}s of audio in {generation_time:.1f}s ({speed_factor:.1f}x realtime) with {current_model_name}"
    print(info)

    return (sr, audio_array), info


def change_model(model_name):
    """Change the active model and return status"""
    try:
        load_and_warm_model(model_name)
        return f"Successfully loaded and warmed up model: {model_name}"
    except Exception as e:
        return f"Error loading model {model_name}: {str(e)}"


def load_sample_text(sample_index):
    """Load a sample text for quick testing"""
    if 0 <= sample_index < len(SAMPLE_TEXTS):
        return SAMPLE_TEXTS[sample_index]
    return ""


# Create Gradio interface
with gr.Blocks(
    title="Vui",
    theme=gr.themes.Soft(),
    head="""
    <script>
    document.addEventListener('DOMContentLoaded', function() {
        // Add keyboard shortcuts
        document.addEventListener('keydown', function(e) {
            // Ctrl/Cmd + Enter to generate (but not when Shift is pressed)
            if ((e.ctrlKey) && e.key === 'Enter' && !e.shiftKey) {
                e.preventDefault();
                const generateBtn = document.querySelector('button[variant="primary"]');
                if (generateBtn && !generateBtn.disabled) {
                    generateBtn.click();
                }
            }
            else if ((e.ctrlKey) && e.code === 'Space') {
                e.preventDefault();
                const audioElement = document.querySelector('audio');
                if (audioElement) {
                    if (audioElement.paused) {
                        audioElement.play();
                    } else {
                        audioElement.pause();
                    }
                }
            }
        });

        // Auto-play audio when it's updated
        const observer = new MutationObserver(function(mutations) {
            mutations.forEach(function(mutation) {
                if (mutation.type === 'childList') {
                    const audioElements = document.querySelectorAll('audio');
                    audioElements.forEach(function(audio) {
                        if (audio.src && !audio.dataset.hasAutoplayListener) {
                            audio.dataset.hasAutoplayListener = 'true';
                            audio.addEventListener('loadeddata', function() {
                                // Small delay to ensure audio is ready
                                setTimeout(() => {
                                    audio.play().catch(e => {
                                        console.log('Autoplay prevented by browser:', e);
                                    });
                                }, 100);
                            });
                        }
                    });
                }
            });
        });

        observer.observe(document.body, {
            childList: true,
            subtree: true
        });

    });
    </script>
    """,
) as demo:

    gr.Markdown(
        "**Keyboard Shortcuts:** `Ctrl + Enter` to generate or `Ctrl + Space` to pause"
    )

    with gr.Row():
        with gr.Column(scale=2):
            # Model selector
            model_dropdown = gr.Dropdown(
                choices=list(AVAILABLE_MODELS.keys()),
                value=default_model,
                label=None,
                info="Select a voice model",
            )

            # Model status
            model_status = gr.Textbox(
                label=None,
                value=f"Model {default_model} loaded and ready",
                interactive=False,
                lines=1,
            )

            # Text input
            text_input = gr.Textbox(
                label=None,
                placeholder="Enter the text you want to convert to speech...",
                lines=5,
                max_lines=10,
            )

        with gr.Column(scale=1):
            # Audio output with autoplay
            audio_output = gr.Audio(
                label="Generated Speech", type="numpy", autoplay=True  # Enable autoplay
            )

            # Info output
            info_output = gr.Textbox(
                label="Generation Info", lines=3, interactive=False
            )

    with gr.Row():
        with gr.Column(scale=2):

            # Sample text buttons
            gr.Markdown("**Quick samples:**")
            with gr.Row():
                sample_btns = []
                for i, sample in enumerate(SAMPLE_TEXTS):
                    btn = gr.Button(f"Sample {i+1}", size="sm")
                    if i == 0:  # Sample 1 (index 0) - use preloaded audio

                        def load_preloaded_sample_1():
                            return (
                                SAMPLE_TEXTS[0],
                                preloaded_sample_1,
                                "Preloaded sample 1 audio",
                            )

                        btn.click(
                            fn=load_preloaded_sample_1,
                            outputs=[text_input, audio_output, info_output],
                        )
                    else:
                        btn.click(
                            fn=lambda idx=i: SAMPLE_TEXTS[idx], outputs=text_input
                        )

            # Generation parameters
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.5,
                    step=0.1,
                    label="Temperature",
                    info="Higher values = more varied speech",
                )

                top_k = gr.Slider(
                    minimum=1,
                    maximum=200,
                    value=100,
                    step=1,
                    label="Top-K",
                    info="Number of top tokens to consider",
                )

                use_top_p = gr.Checkbox(label="Use Top-P sampling", value=False)
                top_p = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.9,
                    step=0.05,
                    label="Top-P",
                    info="Cumulative probability threshold",
                    visible=False,
                )

                max_duration = gr.Slider(
                    minimum=5,
                    maximum=120,
                    value=60,
                    step=5,
                    label="Max Duration (seconds)",
                    info="Maximum length of generated audio",
                )

            # Show/hide top_p based on checkbox
            use_top_p.change(
                fn=lambda x: gr.update(visible=x), inputs=use_top_p, outputs=top_p
            )

    # Generate button
    generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

    # Examples section
    gr.Markdown("## 📝 Example Texts")
    with gr.Accordion("View example texts", open=False):
        for i, sample in enumerate(SAMPLE_TEXTS):
            gr.Markdown(f"**Sample {i+1}:** {sample}")

    # Connect the model change function
    model_dropdown.change(fn=change_model, inputs=model_dropdown, outputs=model_status)

    # Connect the generate function
    def generate_wrapper(text, temp, k, use_p, p, duration):
        top_p_val = p if use_p else None
        return text_to_speech(text, temp, k, top_p_val, duration)

    generate_btn.click(
        fn=generate_wrapper,
        inputs=[text_input, temperature, top_k, use_top_p, top_p, max_duration],
        outputs=[audio_output, info_output],
    )

    # Also allow Enter key to generate
    text_input.submit(
        fn=generate_wrapper,
        inputs=[text_input, temperature, top_k, use_top_p, top_p, max_duration],
        outputs=[audio_output, info_output],
    )

    # Auto-load sample 1 on startup
    demo.load(
        fn=lambda: (
            SAMPLE_TEXTS[0],
            preloaded_sample_1,
            "Sample 1 preloaded and ready!",
        ),
        outputs=[text_input, audio_output, info_output],
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", share=True)
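For quick smoke-testing outside the Gradio UI, the helper defined above can be called directly. This is a minimal sketch, assuming the module imports cleanly as `app` from the repo root on a CUDA machine (importing it loads and warms a model) and that `soundfile` from requirements.txt is installed; the output file name is illustrative:

```python
# Minimal sketch: call the TTS helper from app.py without the Gradio UI.
# Assumes a CUDA GPU is available, since importing `app` loads and warms a model.
import soundfile as sf

import app  # assumed importable from the repo root

(sr, audio), info = app.text_to_speech("A quick smoke test.", temperature=0.5, top_k=100)
print(info)
sf.write("smoke_test.wav", audio, sr)  # soundfile is listed in requirements.txt
```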
inference.py
ADDED
@@ -0,0 +1,12 @@
import torchaudio

from vui.inference import render
from vui.model import Vui

model = Vui.from_pretrained().cuda()
waveform = render(
    model,
    "Hey, here is some random stuff, usually something quite long as the shorter the text the less likely the model can cope!",
)
print(waveform.shape)
torchaudio.save("out.opus", waveform[0], 22050)
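The script above hard-codes 22050 Hz when saving. A small variation, sketched below under the assumption that the loaded checkpoint exposes the same `codec.config.sample_rate` attribute that app.py reads, writes a WAV at the codec's configured rate instead:

```python
# Sketch: save with the codec's configured sample rate rather than a literal.
# Assumes `model` and `waveform` come from the script above and that
# `model.codec.config.sample_rate` exists (app.py relies on the same attribute).
import torchaudio

sr = model.codec.config.sample_rate
torchaudio.save("out.wav", waveform[0].cpu(), sr)
```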
requirements.txt
ADDED
@@ -0,0 +1,17 @@
einops
huggingface_hub[hf_transfer]
inflect
gradio
numba
numpy
openai-whisper
feedparser
pydantic
pyannote.audio
soundfile
sphn
tiktoken
torch
torchaudio
tqdm
transformers
src/vui/__init__.py
ADDED
@@ -0,0 +1 @@
__version__ = "0.1.0"
src/vui/config.py
ADDED
@@ -0,0 +1,41 @@
import sys

from pydantic import BaseModel


class VuiConfig(BaseModel):
    max_text_tokens: int = 100
    text_size: int = -1
    max_audio_tokens: int = 100

    n_quantizers: int = 9
    codebook_size: int = 1000
    special_token_id: int = 1000
    audio_eos_id: int = 1000 + 1
    audio_pad_id: int = 1000 + 1 + 1
    d_model: int = 512
    n_layers: int = 6
    n_heads: int = 8
    bias: bool = False
    dropout: float = 0.0
    use_rotary_emb: bool = True
    rope_dim: int | None = None
    rope_theta: float = 10_000.0
    rope_theta_rescale_factor: float = 1.0


class Config(BaseModel):
    name: str = "base"

    checkpoint: str | dict | None = None

    model: VuiConfig = VuiConfig()


ALL = []
current_module = sys.modules[__name__]
for name in dir(current_module):
    if name.isupper() and isinstance(getattr(current_module, name), Config):
        ALL.append(getattr(current_module, name))

CONFIGS = {v.name: v for v in ALL}
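The loop at the bottom of config.py auto-registers any module-level UPPERCASE `Config` instance into `CONFIGS`; none are defined in this commit, so `ALL` and `CONFIGS` start empty. A hedged illustration of how a named config would get picked up if one were added to config.py above the loop (the `BASE` constant and its settings here are hypothetical, not part of this file):

```python
# Hypothetical example: a constant like this, defined before the registration
# loop runs, would be discovered by the dir()/isupper() scan and exposed as
# CONFIGS["base"].
BASE = Config(name="base", model=VuiConfig(d_model=512, n_layers=6))
```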
src/vui/fluac.py
ADDED
@@ -0,0 +1,707 @@
import math
from contextlib import nullcontext
from functools import partial, wraps
from os import path
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import pack, rearrange, unpack
from einops.layers.torch import Rearrange
from pydantic import BaseModel
from torch import Tensor, int32
from torch.amp import autocast
from torch.nn import Module
from torch.nn.utils.parametrizations import weight_norm

from vui.utils import decompile_state_dict


def exists(v):
    return v is not None


def default(*args):
    for arg in args:
        if exists(arg):
            return arg
    return None


def maybe(fn):
    @wraps(fn)
    def inner(x, *args, **kwargs):
        if not exists(x):
            return x
        return fn(x, *args, **kwargs)

    return inner


def pack_one(t, pattern):
    return pack([t], pattern)


def unpack_one(t, ps, pattern):
    return unpack(t, ps, pattern)[0]


def round_ste(z: Tensor) -> Tensor:
    """Round with straight through gradients."""
    zhat = z.round()
    return z + (zhat - z).detach()


class FSQ(Module):
    def __init__(
        self,
        levels: List[int],
        dim: int | None = None,
        num_codebooks: int = 1,
        keep_num_codebooks_dim: bool | None = None,
        allowed_dtypes: Tuple[torch.dtype, ...] = (torch.float32, torch.float64),
        channel_first: bool = True,
        projection_has_bias: bool = True,
        return_indices=True,
        force_quantization_f32: bool = True,
    ):
        super().__init__()

        _levels = torch.tensor(levels, dtype=int32)
        self.register_buffer("_levels", _levels, persistent=False)

        _basis = torch.cumprod(torch.tensor([1] + levels[:-1]), dim=0, dtype=int32)
        self.register_buffer("_basis", _basis, persistent=False)

        codebook_dim = len(levels)
        self.codebook_dim = codebook_dim

        effective_codebook_dim = codebook_dim * num_codebooks
        self.num_codebooks = num_codebooks
        self.effective_codebook_dim = effective_codebook_dim

        keep_num_codebooks_dim = default(keep_num_codebooks_dim, num_codebooks > 1)
        assert not (num_codebooks > 1 and not keep_num_codebooks_dim)
        self.keep_num_codebooks_dim = keep_num_codebooks_dim

        self.dim = default(dim, len(_levels) * num_codebooks)

        self.channel_first = channel_first

        has_projections = self.dim != effective_codebook_dim
        self.project_in = (
            nn.Linear(self.dim, effective_codebook_dim, bias=projection_has_bias)
            if has_projections
            else nn.Identity()
        )
        self.project_out = (
            nn.Linear(effective_codebook_dim, self.dim, bias=projection_has_bias)
            if has_projections
            else nn.Identity()
        )

        self.has_projections = has_projections

        self.return_indices = return_indices
        if return_indices:
            self.codebook_size = self._levels.prod().item()
            implicit_codebook = self._indices_to_codes(torch.arange(self.codebook_size))
            self.register_buffer(
                "implicit_codebook", implicit_codebook, persistent=False
            )

        self.allowed_dtypes = allowed_dtypes
        self.force_quantization_f32 = force_quantization_f32

    def bound(self, z, eps: float = 1e-3):
        """Bound `z`, an array of shape (..., d)."""
        half_l = (self._levels - 1) * (1 + eps) / 2
        offset = torch.where(self._levels % 2 == 0, 0.5, 0.0)
        shift = (offset / half_l).atanh()
        return (z + shift).tanh() * half_l - offset

    def quantize(self, z):
        """Quantizes z, returns quantized zhat, same shape as z."""
        quantized = round_ste(self.bound(z))
        half_width = self._levels // 2  # Renormalize to [-1, 1].
        return quantized / half_width

    def _scale_and_shift(self, zhat_normalized):
        half_width = self._levels // 2
        return (zhat_normalized * half_width) + half_width

    def _scale_and_shift_inverse(self, zhat):
        half_width = self._levels // 2
        return (zhat - half_width) / half_width

    def _indices_to_codes(self, indices):
        level_indices = self.indices_to_level_indices(indices)
        codes = self._scale_and_shift_inverse(level_indices)
        return codes

    def codes_to_indices(self, zhat):
        """Converts a `code` to an index in the codebook."""
        assert zhat.shape[-1] == self.codebook_dim
        zhat = self._scale_and_shift(zhat)
        return (zhat * self._basis).sum(dim=-1).to(int32)

    def indices_to_level_indices(self, indices):
        """Converts indices to indices at each level, perhaps needed for a transformer with factorized embeddings"""
        indices = rearrange(indices, "... -> ... 1")
        codes_non_centered = (indices // self._basis) % self._levels
        return codes_non_centered

    def indices_to_codes(self, indices):
        """Inverse of `codes_to_indices`."""
        assert exists(indices)

        is_img_or_video = indices.ndim >= (3 + int(self.keep_num_codebooks_dim))

        codes = self._indices_to_codes(indices)

        if self.keep_num_codebooks_dim:
            codes = rearrange(codes, "... c d -> ... (c d)")

        codes = self.project_out(codes)

        if is_img_or_video or self.channel_first:
            codes = rearrange(codes, "b ... d -> b d ...")

        return codes

    def forward(self, z: Tensor):
        """
        einstein notation
        b - batch
        n - sequence (or flattened spatial dimensions)
        d - feature dimension
        c - number of codebook dim
        """
        device_type = z.device.type

        with torch.autocast(device_type=device_type, enabled=False):
            if self.channel_first:
                z = rearrange(z, "b d ... -> b ... d")
                z, ps = pack_one(z, "b * d")

            assert (
                z.shape[-1] == self.dim
            ), f"expected dimension of {self.dim} but found dimension of {z.shape[-1]}"

            z = self.project_in(z)

            z = rearrange(z, "b n (c d) -> b n c d", c=self.num_codebooks)

            # whether to force quantization step to be full precision or not

            force_f32 = self.force_quantization_f32
            quantization_context = (
                partial(autocast, device_type=device_type, enabled=False)
                if force_f32
                else nullcontext
            )

            with quantization_context():
                orig_dtype = z.dtype

                if force_f32 and orig_dtype not in self.allowed_dtypes:
                    z = z.float()

                codes = self.quantize(z)

                # returning indices could be optional

                indices = None

                if self.return_indices:
                    indices = self.codes_to_indices(codes)

                codes = rearrange(codes, "b n c d -> b n (c d)")

                codes = codes.type(orig_dtype)

            # project out

            out = self.project_out(codes)

            # reconstitute image or video dimensions

            if self.channel_first:
                out = unpack_one(out, ps, "b * d")
                out = rearrange(out, "b ... d -> b d ...")

                indices = maybe(unpack_one)(indices, ps, "b * c")

            if not self.keep_num_codebooks_dim and self.return_indices:
                indices = maybe(rearrange)(indices, "... 1 -> ...")

            # return quantized output and indices

            return out, indices


def WNConv1d(*args, **kwargs):
    return weight_norm(nn.Conv1d(*args, **kwargs))


def WNConvTranspose1d(*args, **kwargs):
    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))


# Scripting this brings model speed up 1.4x
@torch.jit.script
def snake(x, alpha):
    shape = x.shape
    x = x.reshape(shape[0], shape[1], -1)
    x = x + (alpha + 1e-9).reciprocal() * torch.sin(alpha * x).pow(2)
    x = x.reshape(shape)
    return x


class Snake1d(nn.Module):
    def __init__(self, channels):
        super().__init__()
        self.alpha = nn.Parameter(torch.ones(1, channels, 1))

    def forward(self, x):
        return snake(x, self.alpha)


def init_weights(m):
    if isinstance(m, nn.Conv1d):
        nn.init.trunc_normal_(m.weight, std=0.02)
        nn.init.constant_(m.bias, 0)


class ResidualUnit(nn.Module):
    def __init__(self, dim: int = 16, dilation: int = 1):
        super().__init__()
        pad = ((7 - 1) * dilation) // 2
        self.block = nn.Sequential(
            Snake1d(dim),
            WNConv1d(dim, dim, kernel_size=7, dilation=dilation, padding=pad),
            Snake1d(dim),
            WNConv1d(dim, dim, kernel_size=1),
        )

    def forward(self, x):
        y = self.block(x)
        pad = (x.shape[-1] - y.shape[-1]) // 2
        if pad > 0:
            x = x[..., pad:-pad]
        return x + y


class EncoderBlock(nn.Module):
    def __init__(self, dim: int = 16, stride: int = 1):
        super().__init__()
        self.block = nn.Sequential(
            ResidualUnit(dim // 2, dilation=1),
            ResidualUnit(dim // 2, dilation=3),
            ResidualUnit(dim // 2, dilation=9),
            Snake1d(dim // 2),
            WNConv1d(
                dim // 2,
                dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
            ),
        )

    def forward(self, x):
        return self.block(x)


class Encoder(nn.Module):
    def __init__(
        self,
        d_model: int = 64,
        strides: list = [2, 4, 8, 8],
        d_latent: int = 64,
    ):
        super().__init__()
        # Create first convolution
        self.block = [WNConv1d(1, d_model, kernel_size=7, padding=3)]

        # Create EncoderBlocks that double channels as they downsample by `stride`
        for stride in strides:
            d_model *= 2
            self.block += [EncoderBlock(d_model, stride=stride)]

        # Create last convolution
        self.block += [
            Snake1d(d_model),
            WNConv1d(d_model, d_latent, kernel_size=3, padding=1),
        ]

        # Wrap block into nn.Sequential
        self.block = nn.Sequential(*self.block)
        self.enc_dim = d_model

    def forward(self, x):
        return self.block(x)


class DecoderBlock(nn.Module):
    def __init__(self, input_dim: int = 16, output_dim: int = 8, stride: int = 1):
        super().__init__()
        self.block = nn.Sequential(
            Snake1d(input_dim),
            WNConvTranspose1d(
                input_dim,
                output_dim,
                kernel_size=2 * stride,
                stride=stride,
                padding=math.ceil(stride / 2),
            ),
            ResidualUnit(output_dim, dilation=1),
            ResidualUnit(output_dim, dilation=3),
            ResidualUnit(output_dim, dilation=9),
        )

    def forward(self, x):
        return self.block(x)


class Decoder(nn.Module):
    def __init__(
        self,
        input_channel: int,
        channels: int,
        rates: list[int],
        d_out: int = 1,
    ):
        super().__init__()

        # Add first conv layer
        layers = [WNConv1d(input_channel, channels, kernel_size=7, padding=3)]

        # Add upsampling + MRF blocks
        for i, stride in enumerate(rates):
            input_dim = channels // 2**i
            output_dim = channels // 2 ** (i + 1)
            layers += [DecoderBlock(input_dim, output_dim, stride)]

        # Add final conv layer
        layers += [
            Snake1d(output_dim),
            WNConv1d(output_dim, d_out, kernel_size=7, padding=3),
            nn.Tanh(),
        ]

        self.model = nn.Sequential(*layers)

    # @torch.compile(dynamic=True)
    def forward(self, z: Tensor):
        return self.model(z)


class FiniteScalarQuantize(nn.Module):
    def __init__(
        self, latent_dim: int, levels: list[int], *, stride: int = 1, mlp: bool = False
    ):
        super().__init__()

        self.stride = stride

        codebook_dim = len(levels)

        self.in_proj = WNConv1d(latent_dim, codebook_dim, kernel_size=1)
        self.quantize = FSQ(levels=levels, channel_first=True)
        self.out_proj = WNConv1d(codebook_dim, latent_dim, kernel_size=1)

        if mlp:
            self.mlp = nn.Sequential(
                Rearrange("B C T -> B T C"),
                nn.Linear(latent_dim, 4 * latent_dim),
                nn.GELU(),
                nn.Linear(4 * latent_dim, latent_dim),
                Rearrange("B T C -> B C T"),
            )
        else:
            self.mlp = None

    def from_indices(self, indices: Tensor):
        B, T = indices.size()
        z_q = self.quantize.indices_to_codes(indices)
        z_q = self.out_proj(z_q)
        return z_q

    def forward(self, z: Tensor, *args):
        if self.stride > 1:
            z = F.avg_pool1d(z, self.stride, stride=self.stride)

        z_e = self.in_proj(z)  # z_e : (B x D x T)

        # we're channels first
        # scale = scale.unsqueeze(-1)

        # z_e = z_e / scale
        z_q, indices = self.quantize(z_e)
        # z_q = z_q * scale

        z_q = self.out_proj(z_q)

        if self.stride > 1:
            z_e = z_e.repeat_interleave(self.stride, dim=-1)
            z_q = z_q.repeat_interleave(self.stride, dim=-1)
            indices = indices.repeat_interleave(self.stride, dim=-1)

        if self.mlp is not None:
            z_q = self.mlp(z_q)

        return z_q, indices, z_e


class ResidualFiniteScalarQuantize(nn.Module):
    def __init__(
        self,
        *,
        latent_dim: int,
        n_quantizers: int,
        levels: list[int],
        strides: list[int] | None = None,
        quantizer_dropout: float = 0.0,
        mlp: bool = False,
    ):
        super().__init__()

        self.n_quantizers = n_quantizers
        self.quantizer_dropout = quantizer_dropout

        strides = [1] * n_quantizers if strides is None else strides

        assert (
            len(strides) == n_quantizers
        ), "Strides must be provided for each codebook"

        scales = []
        quantizers = []
        levels_tensor = torch.tensor(levels, dtype=torch.float32)

        for i in range(n_quantizers):
            scales.append((levels_tensor - 1) ** -i)
            quantizers.append(
                FiniteScalarQuantize(
                    latent_dim=latent_dim, levels=levels, stride=strides[i], mlp=mlp
                )
            )

        self.quantizers = nn.ModuleList(quantizers)

        self.register_buffer("scales", torch.stack(scales), persistent=False)

        codebooks = [
            quantizer.quantize.implicit_codebook for quantizer in self.quantizers
        ]
        self.codebooks = torch.stack(codebooks, dim=0)

    def from_indices(self, indices: Tensor):
        B, Q, T = indices.size()

        z_q = 0.0

        for i, quantizer in enumerate(self.quantizers):
            z_q_i = quantizer.from_indices(indices[:, i])
            z_q = z_q + z_q_i

        return z_q

    def forward(self, z: Tensor, n_quantizers: int | None = None):
        """Quantizes the input tensor using a fixed set of `n` codebooks and returns
        the corresponding codebook vectors
        Parameters
        ----------
        z : Tensor[B x D x T]
        n_quantizers : int, optional
            No. of quantizers to use
            (n_quantizers < self.n_codebooks ex: for quantizer dropout)
            Note: if `self.quantizer_dropout` is True, this argument is ignored
            when in training mode, and a random number of quantizers is used.
        Returns
        -------
        dict
            A dictionary with the following keys:

            "z" : Tensor[B x D x T]
                Quantized continuous representation of input
            "codes" : Tensor[B x N x T]
                Codebook indices for each codebook
                (quantized discrete representation of input)
            "latents" : Tensor[B x N*D x T]
                Projected latents (continuous representation of input before quantization)
        """
        B = z.shape[0]
        z_q = 0
        residual = z

        indices = []
        latents = []

        if n_quantizers is None:
            n_quantizers = self.n_quantizers

        if self.training:
            n_quantizers = torch.ones((B,)) * self.n_quantizers + 1
            dropout = torch.randint(1, self.n_quantizers + 1, (B,))
            n_dropout = int(B * self.quantizer_dropout)
            n_quantizers[:n_dropout] = dropout[:n_dropout]
            n_quantizers = n_quantizers.to(z.device)

        for i, quantizer in enumerate(self.quantizers):
            if not self.training and i >= n_quantizers:
                break

            z_q_i, indices_i, z_e_i = quantizer(residual)

            residual = residual - z_q_i.detach()

            mask = torch.full((B,), fill_value=i, device=z.device) < n_quantizers
            z_q = z_q + z_q_i * mask[:, None, None]

            indices.append(indices_i)
            latents.append(z_e_i)

        indices = torch.stack(indices, dim=1)
        latents = torch.cat(latents, dim=1)

        return z_q, indices, latents


class FluacConfig(BaseModel):
    sample_rate: int = 44100

    codebook_size: int | None = None

    encoder_dim: int = 64
    encoder_rates: list[int] = [2, 4, 8, 8]

    quantizer_strides: list[int] | None = None  # SNAC style strides
    n_quantizers: int = 1
    fsq_levels: list[int] | None = [8, 5, 5, 5]  # 1000
    decoder_dim: int = 1536
    decoder_rates: list[int] = [8, 8, 4, 2]

    @property
    def hop_length(self) -> int:
        return math.prod(self.encoder_rates)

    @property
    def latent_dim(self) -> int:
        return self.encoder_dim * (2 ** len(self.encoder_rates))

    @property
    def effective_codebook_size(self) -> int:
        return math.prod(self.fsq_levels)


class Fluac(nn.Module):
    Q9_22KHZ = "fluac-22hz-22khz.pt"

    def __init__(self, config: FluacConfig):
        super().__init__()

        self.config = config

        self.encoder = Encoder(
            config.encoder_dim, config.encoder_rates, config.latent_dim
        )

        self.quantizer = ResidualFiniteScalarQuantize(
            latent_dim=config.latent_dim,
            n_quantizers=config.n_quantizers,
            levels=config.fsq_levels,
            strides=config.quantizer_strides,
        )

        self.decoder = Decoder(
            config.latent_dim,
            config.decoder_dim,
            config.decoder_rates,
        )

        self.apply(init_weights)

    @staticmethod
    def from_pretrained(name: str = Q9_22KHZ):
        if path.exists(name):
            checkpoint_path = name
        else:
            from huggingface_hub import hf_hub_download

            checkpoint_path = hf_hub_download(
                "fluxions/vui",
                name,
            )

        checkpoint = torch.load(checkpoint_path, weights_only=True, map_location="cpu")
        config = checkpoint["config"]
        if "model" in config:
            model_config = FluacConfig(**config["model"])
        else:
            model_config = FluacConfig(**config)

        generator = Fluac(model_config).eval()
        ckpt = decompile_state_dict(checkpoint["generator"])
        generator.load_state_dict(ckpt)
        return generator

    def pad(self, waveform: Tensor):
        T = waveform.size(-1)
        right_pad = math.ceil(T / self.config.hop_length) * self.config.hop_length - T
        waveform = F.pad(waveform, (0, right_pad))
        return waveform

    @torch.inference_mode()
    def from_indices(self, indices: Tensor):
        z_q = self.quantizer.from_indices(indices)
        waveform = self.decoder(z_q)
        return waveform

    @torch.inference_mode()
    def encode(self, waveforms: Tensor, n_quantizers: int | None = None):
        # Ensure that waveforms is 3 dims
        waveforms = waveforms.flatten()[None][None]
        waveforms = self.pad(waveforms)
        B, C, T = waveforms.size()
        z = self.encoder(waveforms)
        z_q, codes, latents = self.quantizer(z, n_quantizers=n_quantizers)
        return codes

    def forward(self, waveforms: Tensor, n_quantizers: int | None = None):
        B, C, T = waveforms.size()
        waveforms = self.pad(waveforms)
        z = self.encoder(waveforms)
        z_q, codes, latents = self.quantizer(z, n_quantizers=n_quantizers)

        recons = self.decoder(z_q)
        recons = recons[..., :T]
        return {
            "recons": recons,
            "codes": codes,
        }

    @property
    def device(self):
        return next(self.parameters()).device

    @property
    def dtype(self):
        return next(self.parameters()).dtype

    @property
    def hz(self):
        import numpy as np

        return self.config.sample_rate / np.prod(self.config.encoder_rates).item()


if __name__ == "__main__":
    codec = Fluac.from_pretrained(Fluac.Q9_22KHZ)
    print(codec.config)
    wav = torch.rand(1, 1, 22050)
    wav = codec.pad(wav)
    codes = codec.encode(wav)
    breakpoint()
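Beyond the `__main__` check above, a full round trip through the codec looks roughly like the sketch below; the shapes in the comments are inferred from the `encode`/`from_indices` code, not measured:

```python
# Sketch of an encode/decode round trip with the Fluac codec.
# Inferred from the code above: encode returns codes of shape
# (B, n_quantizers, T_codes) and from_indices decodes them back to a waveform.
import torch

codec = Fluac.from_pretrained(Fluac.Q9_22KHZ).eval()
wav = torch.randn(1, 1, 22050)          # roughly one second of audio
codes = codec.encode(wav)               # discrete codes
recon = codec.from_indices(codes)       # decoded waveform
print(codes.shape, recon.shape)
```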
src/vui/inference.py
ADDED
@@ -0,0 +1,405 @@
import re
import time

import inflect
import torch
import torch.nn.functional as F
import torchaudio
from torch import Tensor
from torch.nn.attention import SDPBackend, sdpa_kernel

from vui.model import Vui
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
from vui.utils import timer
from vui.vad import detect_voice_activity as vad


def ensure_spaces_around_tags(text: str):
    # Add space before '[' if not preceded by space, '<', or '['
    text = re.sub(
        r"(?<![<\[\s])(\[)",
        lambda m: (
            f"\n{m.group(1)}"
            if m.start() > 0 and text[m.start() - 1] == "\n"
            else f" {m.group(1)}"
        ),
        text,
    )
    # Add space after ']' if not preceded by digit+']' and not followed by space, '>', or ']'
    text = re.sub(
        r"(?<!\d\])(\])(?![>\]\s])",
        lambda m: (
            f"{m.group(1)}\n"
            if m.end() < len(text) and text[m.end()] == "\n"
            else f"{m.group(1)} "
        ),
        text,
    )
    text = text.strip()
    return text


REPLACE = [
    ("—", ","),
    ("'", "'"),
    (":", ","),
    (";", ","),
]

engine = None
wm = None


def asr(chunk, model=None, prefix=None):
    import whisper

    global wm
    if model is not None:
        wm = model
    elif wm is None:
        wm = whisper.load_model("turbo", "cuda")

    """Process audio with VAD and transcribe"""
    chunk = whisper.pad_or_trim(chunk)
    mel = whisper.log_mel_spectrogram(chunk, n_mels=wm.dims.n_mels).to(wm.device)
    options = whisper.DecodingOptions(
        language="en", without_timestamps=True, prefix=prefix
    )
    result = whisper.decode(wm, mel[None], options)
    return result[0].text


def replace_numbers_with_words(text):
    global engine

    if engine is None:
        engine = inflect.engine()

    # Function to convert a number match to words
    def number_to_words(match):
        number = match.group()
        return engine.number_to_words(number) + " "

    # Replace digits with their word equivalents
    return re.sub(r"\d+", number_to_words, text)


valid_non_speech = ["breath", "sigh", "laugh", "tut", "hesitate"]
valid_non_speech = [f"[{v}]" for v in valid_non_speech]


def remove_all_invalid_non_speech(txt):
    """
    Remove all non-speech markers that are not in the valid_non_speech list.
    Only keeps valid non-speech markers like [breath], [sigh], etc.
    """
    # Find all text within square brackets
    bracket_pattern = r"\[([^\]]+)\]"
    brackets = re.findall(bracket_pattern, txt)

    # For each bracketed text, check if it's in our valid list
    for bracket in brackets:
        bracket_with_brackets = f"[{bracket}]"
        if bracket_with_brackets not in valid_non_speech and bracket != "pause":
            # If not valid, remove it from the text
            txt = txt.replace(bracket_with_brackets, "")

    return txt


def simple_clean(text):
    text = re.sub(r"(\d+)am", r"\1 AM", text)
    text = re.sub(r"(\d+)pm", r"\1 PM", text)
    text = replace_numbers_with_words(text)
    text = ensure_spaces_around_tags(text)
    text = remove_all_invalid_non_speech(text)

    text = text.replace('"', "")
    text = text.replace("”", "")
    text = text.replace("“", "")
    text = text.replace("’", "'")
    text = text.replace("%", " percent")
    text = text.replace("*", "")
    text = text.replace("(", "")
    text = text.replace(")", "")
    text = text.replace(";", "")
    text = text.replace("–", " ")
    text = text.replace("—", "")
    text = text.replace(":", "")
    text = text.replace("…", "...")
    text = text.replace("s...", "s")

    # replace repeating \n with just one \n
    text = re.sub(r"\n+", "\n", text)
    ntxt = re.sub(r" +", " ", text)

    # Ensure that ntxt ends with . or ?
    ntxt = ntxt.strip()
    if not ntxt.endswith((".", "?")):
        ntxt += "."
    ntxt += " [pause]"
    return ntxt


@torch.inference_mode()
def generate(
    self: Vui,
    text: str,
    prompt_codes: Tensor | None = None,
    temperature: float = 0.5,
    top_k: int | None = 150,
    top_p: float | None = None,
    max_gen_len: int = int(120 * 21.53),
):
    text = simple_clean(text)
    with (
        torch.autocast("cuda", torch.bfloat16, True),
        sdpa_kernel([SDPBackend.MATH]),
        timer("generate"),
    ):
        t1 = time.perf_counter()
        batch_size = 1
        device = self.device
        self.dtype
        self.decoder.allocate_inference_cache(batch_size, device, torch.bfloat16)

        texts = [text]

        encoded = self.tokenizer(
            texts,
            padding="longest",
            return_tensors="pt",
        )

        input_ids = encoded.input_ids.to(device)
        text_embeddings = self.token_emb(input_ids)

        B = batch_size
        Q = self.config.model.n_quantizers

        if prompt_codes is None:
            prompt_codes = torch.zeros(
                (batch_size, Q, 0), dtype=torch.int64, device=device
            )
        else:
            prompt_codes = prompt_codes[:, :Q].repeat(batch_size, 1, 1)

        start_offset = prompt_codes.size(-1)

        pattern = self.pattern_provider.get_pattern(max_gen_len)
        # this token is used as default value for codes that are not generated yet
        unknown_token = -1
        special_token_id = self.config.model.special_token_id

        # we generate codes up to the max_gen_len that will be mapped to the pattern sequence
        codes = torch.full(
            (B, Q, max_gen_len), unknown_token, dtype=torch.int64, device=device
        )

        codes[:, :, :start_offset] = prompt_codes

        sequence, indexes, mask = pattern.build_pattern_sequence(
            codes, special_token_id
        )
        # retrieve the start_offset in the sequence:
        # it is the first sequence step that contains the `start_offset` timestep
        start_offset_sequence = pattern.get_first_step_with_timesteps(start_offset)
        assert start_offset_sequence is not None

        prev_offset = 0
        S = sequence.size(-1)

        do_prefill = True
        eos = self.config.model.audio_eos_id

        for offset in range(start_offset_sequence, S):
            # print(f"{prev_offset}:{offset}")
            curr_sequence = sequence[..., prev_offset:offset]
            audio_embeddings = (
                sum([self.audio_embeddings[q](curr_sequence[:, q]) for q in range(Q)])
                / Q
            )

            if do_prefill:
                embeddings = torch.cat((text_embeddings, audio_embeddings), dim=1)
                T = embeddings.size(1)
                input_pos = torch.arange(0, T, device=device)
                do_prefill = False
            else:
                embeddings = audio_embeddings
                input_pos = torch.tensor([T], device=device)
                T += 1

            out = self.decoder(embeddings, input_pos)

            if offset == 15:
                print("TTFB", time.perf_counter() - t1)

            logits = torch.stack(
                [self.audio_heads[q](out[:, -1]) for q in range(Q)], dim=1
            )

            repetition_penalty = 1.4
            history_window = 12

            # Get the history of generated tokens for each quantizer
            for q in range(Q):
                # Extract the history window for this quantizer
                history_start = max(0, offset - history_window)
                token_history = sequence[0, q, history_start:offset]

                # Only apply penalty to tokens that appear in the history
                unique_tokens = torch.unique(token_history)
                unique_tokens = unique_tokens[unique_tokens != special_token_id]
                unique_tokens = unique_tokens[unique_tokens != eos]
                unique_tokens = unique_tokens[unique_tokens != unknown_token]

                if len(unique_tokens) > 0:
                    # Apply penalty by dividing the logits for tokens that have appeared recently
                    logits[0, q, unique_tokens] = (
                        logits[0, q, unique_tokens] / repetition_penalty
                    )

            if offset < 24.53 * 4:
                logits[..., eos] = -float("inf")

            probs = F.softmax(logits / temperature, dim=-1)

            # print(probs.shape)
            if top_p is not None and top_k is not None:
                next_codes = sample_top_p_top_k(probs, top_p, top_k)
            elif top_p is not None and top_p > 0:
                next_codes = sample_top_p(probs, top_p)
            elif top_k is not None and top_k > 0:
                next_codes = sample_top_k(probs, top_k)
            else:
                next_codes = multinomial(probs, num_samples=1)

            next_codes = next_codes.repeat(batch_size, 1, 1)

            if (probs[..., eos] > 0.95).any():
                print("breaking at", offset)
                break

            valid_mask = mask[..., offset : offset + 1].expand(B, -1, -1)
            next_codes[~valid_mask] = special_token_id

            sequence[..., offset : offset + 1] = torch.where(
                sequence[..., offset : offset + 1] == unknown_token,
                next_codes,
                sequence[..., offset : offset + 1],
            )

            prev_offset = offset

        # print(sequence.shape)
        out_codes, out_indexes, out_mask = pattern.revert_pattern_sequence(
            sequence, special_token=unknown_token
        )

        # sanity checks over the returned codes and corresponding masks
        # assert (out_codes[..., :max_gen_len] != unknown_token).all()
        # assert (out_mask[..., :max_gen_len] == 1).all()
        out_codes = out_codes[..., prompt_codes.shape[-1] : offset]
        return out_codes[[0]]


@torch.inference_mode()
def render(
    self: Vui,
    text: str,
    prompt_codes: Tensor | None = None,
    temperature: float = 0.5,
    top_k: int | None = 100,
    top_p: float | None = None,
    max_secs: int = 100,
):
    """
    Render audio from text. Uses generate for text < 1000 characters,
    otherwise breaks text into sections and uses chunking with context.
    """
    text = remove_all_invalid_non_speech(text)
    text = simple_clean(text)
    SR = self.codec.config.sample_rate
    HZ = self.codec.hz
    max_gen_len = int(HZ * max_secs)

    if len(text) < 1000:
        codes = generate(
            self, text, prompt_codes, temperature, top_k, top_p, max_gen_len
        )
        codes = codes[..., :-10]
        audio = self.codec.from_indices(codes)
        paudio = torchaudio.functional.resample(audio[0], 22050, 16000)
        results = vad(paudio)

        if len(results):
            # Cut the audio based on VAD results, add 200ms silence at end
            s, e = results[0][0], results[-1][1]
            return audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()

        raise Exception("Failed to render")

    # Otherwise we have to do some clever chaining!

    orig_codes = prompt_codes

    lines = text.split("\n")
    audios = []
    prev_codes = prompt_codes
    prev_text = ""

    for i, line in enumerate(lines):
        run = True
        while run:
            current_text = prev_text + "\n" + line if prev_text else line
            current_text = current_text.strip()
            current_text = current_text.replace("...", "")
            current_text = current_text + " [pause]"

            # Calculate max length based on text length
            maxlen = int(HZ * int(60 * len(current_text) / 500))

            try:
                print("rendering", current_text)
                with (
                    torch.nn.attention.sdpa_kernel(torch.nn.attention.SDPBackend.MATH),
                    torch.autocast("cuda", dtype=torch.bfloat16, enabled=True),
                ):
                    codes = generate(
                        self,
                        current_text,
                        prompt_codes=prev_codes,
|
373 |
+
temperature=temperature,
|
374 |
+
top_k=top_k,
|
375 |
+
top_p=top_p,
|
376 |
+
max_gen_len=maxlen,
|
377 |
+
)
|
378 |
+
|
379 |
+
codes = codes[..., :-10]
|
380 |
+
audio = self.codec.from_indices(codes)
|
381 |
+
# Resample for VAD
|
382 |
+
paudio = torchaudio.functional.resample(audio[0], 22050, 16000)
|
383 |
+
|
384 |
+
results = vad(paudio)
|
385 |
+
run = len(results) == 0
|
386 |
+
|
387 |
+
if len(results):
|
388 |
+
prev_text = line
|
389 |
+
# Cut the audio based on VAD results, add 200ms silence at end
|
390 |
+
s, e = results[0][0], results[0][1]
|
391 |
+
codes = codes[..., int(s * HZ) : int(e * HZ)]
|
392 |
+
prev_codes = codes
|
393 |
+
audio = audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
394 |
+
audios.append(audio)
|
395 |
+
else:
|
396 |
+
prev_codes = orig_codes
|
397 |
+
prev_text = ""
|
398 |
+
except KeyboardInterrupt:
|
399 |
+
break
|
400 |
+
except RuntimeError as e:
|
401 |
+
prev_codes = orig_codes
|
402 |
+
prev_text = ""
|
403 |
+
print(e)
|
404 |
+
|
405 |
+
return torch.cat(audios, dim=-1)
|
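For reference, a minimal usage sketch of the render entry point above (not part of the commit). It assumes a CUDA device is available; the checkpoint name comes from the Vui class constants and the output filename "out.wav" is an arbitrary choice.

import torchaudio
from vui.model import Vui
from vui.inference import render

# Load an inference-ready model and render a short line of text.
model = Vui.from_pretrained_inf(Vui.ABRAHAM).cuda()
audio = render(model, "Hello there, this is a quick test.")

# Flatten to a single channel and save at the codec's sample rate.
wav = audio.reshape(1, -1).cpu()
torchaudio.save("out.wav", wav, model.codec.config.sample_rate)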
src/vui/model.py
ADDED
@@ -0,0 +1,445 @@
import math
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from einops import rearrange
from torch import Tensor
from transformers import AutoTokenizer

from vui.fluac import Fluac
from vui.utils import load_what_you_can

from .config import Config
from .patterns import DelayedPatternProvider
from .rope import apply_rotary_emb, precompute_freqs_cis


class KVCache(nn.Module):
    def __init__(
        self,
        batch_size: int,
        max_seqlen: int,
        n_kv_heads: int,
        head_dim: int,
        dtype: torch.dtype = torch.bfloat16,
    ):
        super().__init__()

        cache_shape = (batch_size, n_kv_heads, max_seqlen, head_dim)

        self.register_buffer("k_cache", torch.zeros(cache_shape, dtype=dtype))
        self.register_buffer("v_cache", torch.zeros(cache_shape, dtype=dtype))

    def update(self, input_pos: Tensor, k_val: Tensor, v_val: Tensor):
        # input_pos: (T,), k_val: (B, nh, T, d)
        assert input_pos.size(0) == k_val.size(-2)

        k_out = self.k_cache
        v_out = self.v_cache
        k_out[:, :, input_pos] = k_val
        v_out[:, :, input_pos] = v_val

        return k_out, v_out


def repeat_kv(x: torch.Tensor, n_reps: int) -> torch.Tensor:
    """torch.repeat_interleave(x, dim=2, repeats=n_rep)"""
    bs, n_kv_heads, T, head_dim = x.shape

    return (
        x[:, :, :, None, :]
        .expand(bs, n_kv_heads, n_reps, T, head_dim)
        .reshape(bs, n_kv_heads * n_reps, T, head_dim)
    )


class MHA(nn.Module):
    def __init__(
        self,
        dim: int,
        n_heads: int,
        n_kv_heads: int,
        *,
        block_idx: int,
        bias: bool = False,
        dropout: float = 0.0,
        causal: bool = False,
        use_rotary_emb: bool = True,
    ):
        super().__init__()

        head_dim = dim // n_heads

        self.use_rotary_emb = use_rotary_emb
        self.block_idx = block_idx
        self.dim = dim
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = head_dim
        self.dropout = dropout
        self.causal = causal
        self.n_reps = n_kv_heads // n_heads
        qkv_dim = (n_heads + 2 * n_kv_heads) * head_dim
        self.Wqkv = nn.Linear(dim, qkv_dim, bias=bias)
        self.out_proj = nn.Linear(dim, dim, bias=bias)
        self.kv_cache = None

    def forward(
        self,
        x: Tensor,
        freqs_cis: Tensor | None = None,
        input_pos: Tensor | None = None,
        attn_mask: Tensor | None = None,
    ):
        B, T, d = x.size()
        x.dtype

        dropout_p = self.dropout if self.training else 0.0

        qkv = self.Wqkv(x)
        if self.n_heads == self.n_kv_heads:
            qkv = rearrange(
                qkv, "B T (three h d) -> B three h T d", three=3, h=self.n_heads
            )
            q, k, v = qkv.unbind(dim=1)  # (B, h, T, d)
        else:
            q, k, v = torch.split(
                qkv,
                [
                    self.head_dim * self.n_heads,
                    self.head_dim * self.n_kv_heads,
                    self.head_dim * self.n_kv_heads,
                ],
                dim=1,
            )
            q, k, v = map(lambda t: rearrange(t, "B T (h d) -> B h T d"), (q, k, v))

        if self.use_rotary_emb:
            q = apply_rotary_emb(freqs_cis, q)
            k = apply_rotary_emb(freqs_cis, k)

        if self.kv_cache is not None:
            k, v = self.kv_cache.update(input_pos, k, v)

        if self.n_reps > 1:
            k = repeat_kv(k, self.n_reps)
            v = repeat_kv(v, self.n_reps)

        is_causal = self.causal and self.kv_cache is None

        out = F.scaled_dot_product_attention(
            q,
            k,
            v,
            dropout_p=dropout_p,
            is_causal=is_causal,
            attn_mask=attn_mask,
        )

        out = self.out_proj(rearrange(out, "B h T d -> B T (h d)"))

        return out


class MLP(nn.Module):
    def __init__(
        self, *, d_model: int, bias: bool, dropout: float, act=nn.GELU, **kwargs
    ):
        super().__init__()
        self.fc1 = nn.Linear(d_model, 4 * d_model, bias=bias)
        self.act = act()
        self.fc2 = nn.Linear(4 * d_model, d_model, bias=bias)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        return self.dropout(self.fc2(self.act(self.fc1(x))))


class LlamaMLP(nn.Module):
    def __init__(
        self, *, d_model: int, multiple_of: int = 256, bias: bool = False, **kwargs
    ) -> None:
        super().__init__()
        hidden_dim = 4 * d_model
        hidden_dim = int(2 * hidden_dim / 3)
        hidden_dim = multiple_of * ((hidden_dim + multiple_of - 1) // multiple_of)
        self.w1 = nn.Linear(d_model, hidden_dim, bias=bias)
        self.w3 = nn.Linear(d_model, hidden_dim, bias=bias)
        self.w2 = nn.Linear(hidden_dim, d_model, bias=bias)

    def forward(self, x: Tensor) -> Tensor:
        return self.w2(F.silu(self.w1(x)) * self.w3(x))


class RMSNorm(nn.Module):
    def __init__(self, dim: int, eps: float = 1e-6):
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def _norm(self, x):
        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)

    def forward(self, x: Tensor):
        output = self._norm(x.float()).type_as(x)
        return output * self.weight


class Block(nn.Module):
    def __init__(
        self,
        *,
        d_model: int,
        n_heads: int,
        n_kv_heads: int,
        block_idx: int,
        bias: bool,
        dropout: float,
        norm_eps: float = 1e-5,  # use 1e-6 for rms
        use_rotary_emb: bool = True,
    ):
        super().__init__()

        self.block_idx = block_idx
        self.n_heads = n_heads
        self.n_kv_heads = n_kv_heads
        self.head_dim = d_model // n_heads

        self.attn_norm = RMSNorm(d_model, eps=norm_eps)
        self.attn = MHA(
            d_model,
            n_heads,
            n_kv_heads,
            block_idx=block_idx,
            bias=bias,
            dropout=dropout,
            causal=True,
            use_rotary_emb=use_rotary_emb,
        )
        self.mlp_norm = RMSNorm(d_model, eps=norm_eps)
        self.mlp = LlamaMLP(d_model=d_model, bias=bias, dropout=dropout)

    def forward(
        self,
        x: Tensor,
        freqs_cis: Tensor | None = None,
        input_pos: Tensor | None = None,
        attn_mask: Tensor | None = None,
    ):
        x = x + self.attn(
            self.attn_norm(x),
            freqs_cis=freqs_cis,
            input_pos=input_pos,
            attn_mask=attn_mask,
        )
        x = x + self.mlp(self.mlp_norm(x))

        return x


class Decoder(nn.Module):
    def __init__(
        self,
        *,
        n_layers: int,
        d_model: int,
        n_heads: int,
        n_kv_heads: int,
        bias: bool,
        dropout: float,
        max_seqlen: int = 4096,
        rope_theta: float = 10000.0,
        rope_theta_rescale_factor: float = 1.0,
        norm_eps: float = 1e-5,
        use_rotary_emb: bool = True,
        rope_dim: int | None = None,
    ):
        super().__init__()
        assert d_model % n_heads == 0

        self.use_rotary_emb = use_rotary_emb

        self.max_seqlen = max_seqlen
        self.blocks = nn.ModuleList(
            [
                Block(
                    d_model=d_model,
                    n_heads=n_heads,
                    n_kv_heads=n_kv_heads,
                    block_idx=block_idx,
                    bias=bias,
                    dropout=dropout,
                    norm_eps=norm_eps,
                    use_rotary_emb=use_rotary_emb,
                )
                for block_idx in range(n_layers)
            ]
        )
        self.norm = RMSNorm(d_model, eps=norm_eps)

        self.attn_mask = None

        head_dim = d_model // n_heads

        rope_dim = rope_dim or head_dim

        assert rope_dim <= head_dim  # apply RoPE to a fraction of embeddings

        freqs_cis = precompute_freqs_cis(
            rope_dim,
            max_seqlen,
            theta=rope_theta,
            theta_rescale_factor=rope_theta_rescale_factor,
        )
        self.register_buffer("freqs_cis", freqs_cis, persistent=False)

    def allocate_inference_cache(
        self, batch_size: int, device: str, dtype=torch.bfloat16
    ):
        for block in self.blocks:
            block.attn.kv_cache = KVCache(
                batch_size, self.max_seqlen, block.n_kv_heads, block.head_dim, dtype
            ).to(device)

        # I don't understand why this is needed
        self.attn_mask = torch.tril(
            torch.ones(
                self.max_seqlen, self.max_seqlen, dtype=torch.bool, device=device
            )
        )

    def deallocate_kv_cache(self):
        for block in self.blocks:
            block.attn.kv_cache = None

        self.attn_mask = None

    def forward(self, x: Tensor, input_pos: Tensor):
        if self.use_rotary_emb:
            freqs_cis = self.freqs_cis[input_pos]
        else:
            freqs_cis = None

        attn_mask = (
            self.attn_mask[None, None, input_pos]
            if self.attn_mask is not None
            else None
        )

        for block in self.blocks:
            x = block(x, freqs_cis=freqs_cis, input_pos=input_pos, attn_mask=attn_mask)

        x = self.norm(x)

        return x


class Vui(nn.Module):
    BASE = "vui-100m-base.pt"
    COHOST = "vui-cohost-100m.pt"
    ABRAHAM = "vui-abraham-100m.pt"

    def __init__(self, config: Config = Config()):
        super().__init__()
        self.codec = Fluac.from_pretrained()
        self.config = config
        cfg = config.model
        self.tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")
        self.use_rotary_emb = cfg.use_rotary_emb
        self.token_emb = nn.Embedding(self.tokenizer.vocab_size, cfg.d_model)

        self.pattern_provider = DelayedPatternProvider(n_q=cfg.n_quantizers)

        self.audio_embeddings = nn.ModuleList(
            [
                nn.Embedding(cfg.codebook_size + 8, cfg.d_model)
                for _ in range(cfg.n_quantizers)
            ]
        )

        n_kv_heads = cfg.n_heads

        max_seqlen = cfg.max_text_tokens + cfg.max_audio_tokens
        self.decoder = Decoder(
            n_layers=cfg.n_layers,
            d_model=cfg.d_model,
            n_heads=cfg.n_heads,
            n_kv_heads=n_kv_heads,
            bias=cfg.bias,
            dropout=cfg.dropout,
            max_seqlen=max_seqlen + cfg.n_quantizers,
            rope_dim=cfg.rope_dim,
            rope_theta=cfg.rope_theta,
            rope_theta_rescale_factor=cfg.rope_theta_rescale_factor,
        )

        self.audio_heads = nn.ModuleList(
            [
                nn.Linear(cfg.d_model, cfg.codebook_size + 8, bias=cfg.bias)
                for _ in range(cfg.n_quantizers)
            ]
        )

        self.apply(self._init_weights)

        for pn, p in self.named_parameters():
            if pn.endswith("out_proj.weight"):
                torch.nn.init.normal_(
                    p, mean=0.0, std=0.02 / math.sqrt(2 * cfg.n_layers)
                )

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    @staticmethod
    def from_pretrained(
        checkpoint_path: str | dict = ABRAHAM,
        **config_kwargs,
    ):
        if isinstance(checkpoint_path, dict):
            checkpoint = checkpoint_path
        else:
            if not os.path.exists(checkpoint_path):
                from huggingface_hub import hf_hub_download

                checkpoint_path = hf_hub_download(
                    "fluxions/vui",
                    checkpoint_path,
                )
            checkpoint = torch.load(
                checkpoint_path, map_location="cpu", weights_only=True
            )

        config = {**checkpoint["config"], **config_kwargs}
        config = Config(**config)
        state_dict = checkpoint["model"]

        state_dict = {k.replace("module.", ""): v for k, v in state_dict.items()}
        state_dict = {
            k.replace("text_embedding.", "token_emb."): v for k, v in state_dict.items()
        }
        model = Vui(config)
        load_what_you_can(state_dict, model)
        return model

    @staticmethod
    def from_pretrained_inf(
        checkpoint_path: str | dict,
        **config_kwargs,
    ):
        return Vui.from_pretrained(checkpoint_path, **config_kwargs).eval()

    @property
    def device(self):
        return next(self.parameters()).device

    @property
    def dtype(self):
        return next(self.parameters()).dtype
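A minimal sketch of how the Decoder's KV cache is driven during inference (prefill once, then single-position steps), mirroring the loop in inference.py. The hyperparameters, batch size and CUDA device below are assumptions for illustration only, not values from this commit.

import torch
from vui.model import Decoder

# Tiny decoder with per-block KV caches sized for 128 positions.
dec = Decoder(
    n_layers=2, d_model=64, n_heads=4, n_kv_heads=4,
    bias=False, dropout=0.0, max_seqlen=128,
).cuda()
dec.allocate_inference_cache(batch_size=1, device="cuda")

x = torch.randn(1, 10, 64, device="cuda")      # prompt embeddings
step = torch.randn(1, 1, 64, device="cuda")    # one new embedding

with torch.autocast("cuda", dtype=torch.bfloat16):
    out = dec(x, torch.arange(10, device="cuda"))       # prefill fills cache slots 0..9
    nxt = dec(step, torch.tensor([10], device="cuda"))  # incremental step reuses cached K/V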
src/vui/notebook.py
ADDED
@@ -0,0 +1,41 @@
import numpy as np
import torch


def play(audio: torch.Tensor | np.ndarray | str, sr=16000, autoplay=True):
    import torchaudio
    from IPython.display import Audio, display

    if isinstance(audio, str):
        audio = torchaudio.load(audio)
    if isinstance(audio, np.ndarray):
        audio = torch.from_numpy(audio)

    assert audio.numel() > 100, "play() needs a non empty audio array"

    audio = audio.flatten()
    if audio.dim() < 2:
        audio = audio[None]

    # Sum Channels
    if audio.shape[0] > 1:
        audio = audio.sum(dim=0)

    display(Audio(audio.cpu().detach(), rate=sr, autoplay=autoplay, normalize=True))


def plot_mel_spec(mel_spec: torch.Tensor | np.ndarray, title: str = None):
    import matplotlib.pyplot as plt

    mel_spec = mel_spec.squeeze()
    if isinstance(mel_spec, torch.Tensor):
        mel_spec = mel_spec.cpu().numpy()

    fig, ax = plt.subplots(figsize=(16, 4))
    im = ax.imshow(mel_spec, aspect="auto", origin="lower", interpolation="none")
    fig.colorbar(im, ax=ax)
    ax.set_xlabel("frames")
    ax.set_ylabel("channels")

    if title is not None:
        ax.set_title(title)
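A quick usage sketch for the notebook helpers above, assuming a Jupyter/IPython session (the synthetic tone is an arbitrary example, not from the commit):

import torch
from vui.notebook import play

# One second of a 440 Hz tone at 16 kHz, just to exercise the widget.
wav = torch.sin(torch.linspace(0, 2 * torch.pi * 440, 16000))
play(wav, sr=16000)  # renders an inline audio player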
src/vui/patterns.py
ADDED
@@ -0,0 +1,423 @@
import logging
from abc import ABC, abstractmethod
from collections import namedtuple
from dataclasses import dataclass
from functools import lru_cache

import torch
import torch.nn.functional as F


def apply_delay_pattern(codes: torch.Tensor, mask_token: int):
    codes = F.pad(codes, (0, codes.shape[1] + 1), value=mask_token)
    return torch.stack([codes[:, k].roll(k + 1) for k in range(codes.shape[1])], dim=1)


def revert_delay_pattern(codes: torch.Tensor):
    _, n_q, seq_len = codes.shape
    return torch.stack(
        [codes[:, k, k + 1 : seq_len - n_q + k] for k in range(n_q)], dim=1
    )


LayoutCoord = namedtuple("LayoutCoord", ["t", "q"])  # (timestep, codebook index)
PatternLayout = list[list[LayoutCoord]]  # Sequence of coordinates
logger = logging.getLogger(__name__)


@dataclass
class Pattern:
    """Base implementation of a pattern over a sequence with multiple codebooks.

    The codebook pattern consists in a layout, defining for each sequence step
    the list of coordinates of each codebook timestep in the resulting interleaved sequence.
    The first item of the pattern is always an empty list in order to properly insert a special token
    to start with. For convenience, we also keep track of ``n_q`` the number of codebooks used for the pattern
    and ``timesteps`` the number of timesteps corresponding to the original sequence.

    The pattern provides convenient methods to build and revert interleaved sequences from it:
    ``build_pattern_sequence`` maps a given a dense input tensor of multi-codebook sequence from [B, K, T]
    to the interleaved sequence of shape [B, K, S] applying the pattern, with B being the batch size,
    K being the number of codebooks, T the number of original timesteps and S the number of sequence steps
    for the output sequence. The unfilled positions are replaced with a special token and the built sequence
    is returned along with a mask indicating valid tokens.
    ``revert_pattern_sequence`` maps back an interleaved sequence of shape [B, K, S] to the original alignment
    of codebooks across timesteps to an output tensor of shape [B, K, T], using again a special token and a mask
    to fill and specify invalid positions if needed.
    See the dedicated methods for more details.
    """

    # Pattern layout, for each sequence step, we have a list of coordinates
    # corresponding to the original codebook timestep and position.
    # The first list is always an empty list in order to properly insert
    # a special token to start with.
    layout: PatternLayout
    timesteps: int
    n_q: int

    def __post_init__(self):
        assert len(self.layout) > 0
        self._validate_layout()
        self._build_reverted_sequence_scatter_indexes = lru_cache(100)(
            self._build_reverted_sequence_scatter_indexes
        )
        self._build_pattern_sequence_scatter_indexes = lru_cache(100)(
            self._build_pattern_sequence_scatter_indexes
        )
        logger.info(
            "New pattern, time steps: %d, sequence steps: %d",
            self.timesteps,
            len(self.layout),
        )

    def _validate_layout(self):
        """Runs checks on the layout to ensure a valid pattern is defined.
        A pattern is considered invalid if:
            - Multiple timesteps for a same codebook are defined in the same sequence step
            - The timesteps for a given codebook are not in ascending order as we advance in the sequence
              (this would mean that we have future timesteps before past timesteps).
        """
        q_timesteps = {q: 0 for q in range(self.n_q)}
        for s, seq_coords in enumerate(self.layout):
            if len(seq_coords) > 0:
                qs = set()
                for coord in seq_coords:
                    qs.add(coord.q)
                    last_q_timestep = q_timesteps[coord.q]
                    assert (
                        coord.t >= last_q_timestep
                    ), f"Past timesteps are found in the sequence for codebook = {coord.q} at step {s}"
                    q_timesteps[coord.q] = coord.t
                # each sequence step contains at max 1 coordinate per codebook
                assert len(qs) == len(
                    seq_coords
                ), f"Multiple entries for a same codebook are found at step {s}"

    @property
    def num_sequence_steps(self):
        return len(self.layout) - 1

    @property
    def max_delay(self):
        max_t_in_seq_coords = 0
        for seq_coords in self.layout[1:]:
            for coords in seq_coords:
                max_t_in_seq_coords = max(max_t_in_seq_coords, coords.t + 1)
        return max_t_in_seq_coords - self.timesteps

    @property
    def valid_layout(self):
        valid_step = len(self.layout) - self.max_delay
        return self.layout[:valid_step]

    def starts_with_special_token(self):
        return self.layout[0] == []

    def get_sequence_coords_with_timestep(self, t: int, q: int | None = None):
        """Get codebook coordinates in the layout that corresponds to the specified timestep t
        and optionally to the codebook q. Coordinates are returned as a tuple with the sequence step
        and the actual codebook coordinates.
        """
        assert (
            t <= self.timesteps
        ), "provided timesteps is greater than the pattern's number of timesteps"
        if q is not None:
            assert (
                q <= self.n_q
            ), "provided number of codebooks is greater than the pattern's number of codebooks"
        coords = []
        for s, seq_codes in enumerate(self.layout):
            for code in seq_codes:
                if code.t == t and (q is None or code.q == q):
                    coords.append((s, code))
        return coords

    def get_steps_with_timestep(self, t: int, q: int | None = None) -> list[int]:
        return [step for step, coords in self.get_sequence_coords_with_timestep(t, q)]

    def get_first_step_with_timesteps(self, t: int, q: int | None = None) -> int | None:
        steps_with_timesteps = self.get_steps_with_timestep(t, q)
        return steps_with_timesteps[0] if len(steps_with_timesteps) > 0 else None

    def _build_pattern_sequence_scatter_indexes(
        self,
        timesteps: int,
        n_q: int,
        keep_only_valid_steps: bool,
        device: torch.device | str = "cpu",
    ):
        """Build scatter indexes corresponding to the pattern, up to the provided sequence_steps.

        Args:
            timesteps (int): Maximum number of timesteps steps to consider.
            keep_only_valid_steps (bool): Restrict the pattern layout to match only valid steps.
            device (torch.device or str): Device for created tensors.
        Returns:
            indexes (torch.Tensor): Indexes corresponding to the sequence, of shape [K, S].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes, of shape [K, S].
        """
        assert (
            n_q == self.n_q
        ), f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
        assert (
            timesteps <= self.timesteps
        ), "invalid number of timesteps used to build the sequence from the pattern"
        # use the proper layout based on whether we limit ourselves to valid steps only or not,
        # note that using the valid_layout will result in a truncated sequence up to the valid steps
        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
        indexes = torch.zeros(n_q, len(ref_layout), dtype=torch.long).numpy()
        mask = torch.zeros(n_q, len(ref_layout), dtype=torch.bool).numpy()
        # fill indexes with last sequence step value that will correspond to our special token
        # the last value is n_q * timesteps as we have flattened z and append special token as the last token
        # which will correspond to the index: n_q * timesteps
        indexes[:] = n_q * timesteps
        # iterate over the pattern and fill scattered indexes and mask
        for s, sequence_coords in enumerate(ref_layout):
            for coords in sequence_coords:
                if coords.t < timesteps:
                    indexes[coords.q, s] = coords.t + coords.q * timesteps
                    mask[coords.q, s] = 1
        indexes = torch.from_numpy(indexes).to(device)
        mask = torch.from_numpy(mask).to(device)
        return indexes, mask

    def build_pattern_sequence(
        self, z: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False
    ):
        """Build sequence corresponding to the pattern from the input tensor z.
        The sequence is built using up to sequence_steps if specified, and non-pattern
        coordinates are filled with the special token.

        Args:
            z (torch.Tensor): Input tensor of multi-codebooks sequence, of shape [B, K, T].
            special_token (int): Special token used to fill non-pattern coordinates in the new sequence.
            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
                Steps that are beyond valid steps will be replaced by the special_token in that case.
        Returns:
            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, S] with S
                corresponding either to the sequence_steps if provided, otherwise to the length of the pattern.
            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, S].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, S].
        """
        B, K, T = z.shape
        indexes, mask = self._build_pattern_sequence_scatter_indexes(
            T, K, keep_only_valid_steps=keep_only_valid_steps, device=str(z.device)
        )
        z = z.reshape(B, -1)
        # we append the special token as the last index of our flattened z tensor
        z = torch.cat([z, torch.zeros_like(z[:, :1]) + special_token], dim=1)
        values = z[:, indexes.view(-1)]
        values = values.view(B, K, indexes.shape[-1])
        return values, indexes, mask

    def _build_reverted_sequence_scatter_indexes(
        self,
        sequence_steps: int,
        n_q: int,
        keep_only_valid_steps: bool = False,
        is_model_output: bool = False,
        device: torch.device | str = "cpu",
    ):
        """Builds scatter indexes required to retrieve the original multi-codebook sequence
        from interleaving pattern.

        Args:
            sequence_steps (int): Sequence steps.
            n_q (int): Number of codebooks.
            keep_only_valid_steps (bool): Build a sequence from the pattern up to valid (= fully defined) steps.
                Steps that are beyond valid steps will be replaced by the special_token in that case.
            is_model_output (bool): Whether to keep the sequence item corresponding to initial special token or not.
            device (torch.device or str): Device for created tensors.
        Returns:
            indexes (torch.Tensor): Indexes for reconstructing the output, of shape [K, T].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
        """
        ref_layout = self.valid_layout if keep_only_valid_steps else self.layout
        timesteps = self.timesteps
        assert (
            n_q == self.n_q
        ), f"invalid number of codebooks for the sequence and the pattern: {n_q} != {self.n_q}"
        assert sequence_steps <= len(
            ref_layout
        ), f"sequence to revert is longer than the defined pattern: {sequence_steps} > {len(ref_layout)}"

        # ensure we take the appropriate indexes to keep the model output from the first special token as well
        if is_model_output and self.starts_with_special_token():
            ref_layout = ref_layout[1:]

        # single item indexing being super slow with pytorch vs. numpy, so we use numpy here
        indexes = torch.zeros(n_q, timesteps, dtype=torch.long).numpy()
        mask = torch.zeros(n_q, timesteps, dtype=torch.bool).numpy()
        # fill indexes with last sequence step value that will correspond to our special token
        indexes[:] = n_q * sequence_steps
        for s, sequence_codes in enumerate(ref_layout):
            if s < sequence_steps:
                for code in sequence_codes:
                    if code.t < timesteps:
                        indexes[code.q, code.t] = s + code.q * sequence_steps
                        mask[code.q, code.t] = 1
        indexes = torch.from_numpy(indexes).to(device)
        mask = torch.from_numpy(mask).to(device)
        return indexes, mask

    def revert_pattern_sequence(
        self, s: torch.Tensor, special_token: int, keep_only_valid_steps: bool = False
    ):
        """Revert a sequence built from the pattern back to the original multi-codebook sequence without interleaving.
        The sequence is reverted using up to timesteps if specified, and non-pattern coordinates
        are filled with the special token.

        Args:
            s (torch.Tensor): Interleaved sequence tensor obtained from the pattern, of shape [B, K, S].
            special_token (int or float): Special token used to fill non-pattern coordinates in the new sequence.
        Returns:
            values (torch.Tensor): Interleaved sequence matching the pattern, of shape [B, K, T] with T
                corresponding either to the timesteps if provided, or the total timesteps in pattern otherwise.
            indexes (torch.Tensor): Indexes corresponding to the interleaved sequence, of shape [K, T].
            mask (torch.Tensor): Mask corresponding to indexes that matches valid indexes of shape [K, T].
        """
        B, K, S = s.shape
        indexes, mask = self._build_reverted_sequence_scatter_indexes(
            S, K, keep_only_valid_steps, is_model_output=False, device=str(s.device)
        )
        s = s.view(B, -1)
        # we append the special token as the last index of our flattened z tensor
        s = torch.cat([s, torch.zeros_like(s[:, :1]) + special_token], dim=1)
        values = s[:, indexes.view(-1)]
        values = values.view(B, K, indexes.shape[-1])
        return values, indexes, mask

    def revert_pattern_logits(
        self,
        logits: torch.Tensor,
        special_token: float,
        keep_only_valid_steps: bool = False,
    ):
        """Revert model logits obtained on a sequence built from the pattern
        back to a tensor matching the original sequence.

        This method is similar to ``revert_pattern_sequence`` with the following specificities:
        1. It is designed to work with the extra cardinality dimension
        2. We return the logits for the first sequence item that matches the special_token and
        which matching target in the original sequence is the first item of the sequence,
        while we skip the last logits as there is no matching target
        """
        B, n, Q, S = logits.shape
        indexes, mask = self._build_reverted_sequence_scatter_indexes(
            S, Q, keep_only_valid_steps, is_model_output=True, device=logits.device
        )
        logits = logits.reshape(B, n, -1)
        # we append the special token as the last index of our flattened z tensor
        logits = torch.cat(
            [logits, torch.zeros_like(logits[:, :, :1]) + special_token], dim=-1
        )  # [B, card, K x S]
        values = logits[:, :, indexes.view(-1)]
        values = values.view(B, n, Q, indexes.shape[-1])
        return values, indexes, mask


class CodebooksPatternProvider(ABC):
    """Abstraction around providing pattern for interleaving codebooks.

    The CodebooksPatternProvider abstraction allows to implement various strategies to
    define interleaving pattern of sequences composed of multiple codebooks. For a given
    number of codebooks `n_q`, the pattern provider can generate a specified pattern
    corresponding to a sequence of `T` timesteps with `n_q` parallel codebooks. This pattern
    can be used to construct a new sequence from the original codes respecting the specified
    pattern. The pattern is defined as a list of list of code coordinates, code coordinate
    being a tuple with the original timestep and codebook to build the new sequence.
    Note that all patterns must start with an empty list that is then used to insert a first
    sequence step of special tokens in the newly generated sequence.

    Args:
        n_q (int): number of codebooks.
        cached (bool): if True, patterns for a given length are cached. In general
            that should be true for efficiency reason to avoid synchronization points.
    """

    def __init__(self, n_q: int, cached: bool = True):
        assert n_q > 0
        self.n_q = n_q
        self.get_pattern = lru_cache(100)(self.get_pattern)  # type: ignore

    @abstractmethod
    def get_pattern(self, timesteps: int) -> Pattern:
        """Builds pattern with specific interleaving between codebooks.

        Args:
            timesteps (int): Total number of timesteps.
        """
        raise NotImplementedError()


class DelayedPatternProvider(CodebooksPatternProvider):
    """Provider for delayed pattern across delayed codebooks.
    Codebooks are delayed in the sequence and sequence steps will contain codebooks
    from different timesteps.

    Example:
        Taking timesteps=4 and n_q=3, delays=None, the multi-codebook sequence:
        [[1, 2, 3, 4],
         [1, 2, 3, 4],
         [1, 2, 3, 4]]
        The resulting sequence obtained from the returned pattern is:
        [[S, 1, 2, 3, 4],
         [S, S, 1, 2, 3],
         [S, S, S, 1, 2]]
        (with S being a special token)

    Args:
        n_q (int): Number of codebooks.
        delays (list of int, optional): Delay for each of the codebooks.
            If delays not defined, each codebook is delayed by 1 compared to the previous one.
        flatten_first (int): Flatten the first N timesteps.
        empty_initial (int): Prepend with N empty list of coordinates.
    """

    def __init__(
        self,
        n_q: int,
        delays: list[int] | None = None,
        flatten_first: int = 0,
        empty_initial: int = 0,
    ):
        super().__init__(n_q)
        if delays is None:
            delays = list(range(n_q))
        self.delays = delays
        self.flatten_first = flatten_first
        self.empty_initial = empty_initial
        assert len(self.delays) == self.n_q
        assert sorted(self.delays) == self.delays

    def get_pattern(self, timesteps: int) -> Pattern:
        omit_special_token = self.empty_initial < 0
        out: PatternLayout = [] if omit_special_token else [[]]
        max_delay = max(self.delays)
        if self.empty_initial:
            out += [[] for _ in range(self.empty_initial)]
        if self.flatten_first:
            for t in range(min(timesteps, self.flatten_first)):
                for q in range(self.n_q):
                    out.append([LayoutCoord(t, q)])
        for t in range(self.flatten_first, timesteps + max_delay):
            v = []
            for q, delay in enumerate(self.delays):
                t_for_q = t - delay
                if t_for_q >= self.flatten_first:
                    v.append(LayoutCoord(t_for_q, q))
            out.append(v)
        return Pattern(out, n_q=self.n_q, timesteps=timesteps)


if __name__ == "__main__":
    # Tried to use the simple patterns to train and something very odd happened.
    Q = 4

    codes = torch.randint(0, 1000, (1, Q, 100))
    pcodes = apply_delay_pattern(codes, 1001)
    provider = DelayedPatternProvider(Q)
    provider = provider.get_pattern(100)
    pcodes2 = provider.build_pattern_sequence(codes, 1001)
    breakpoint()
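A small round-trip sketch for the delayed pattern above (the tensor values and sizes are arbitrary; special_token=0 is just an illustrative choice):

import torch
from vui.patterns import DelayedPatternProvider

provider = DelayedPatternProvider(n_q=3)            # delays default to [0, 1, 2]
pattern = provider.get_pattern(timesteps=5)

codes = torch.arange(1, 16).reshape(1, 3, 5)        # [B=1, K=3, T=5]
seq, idx, mask = pattern.build_pattern_sequence(codes, special_token=0)
print(seq.shape)                                    # [1, 3, 8]: 1 special step + 5 timesteps + max delay 2

rev, _, rev_mask = pattern.revert_pattern_sequence(seq, special_token=0)
assert (rev == codes).all()                         # reverting the pattern recovers the original codes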
src/vui/rope.py
ADDED
@@ -0,0 +1,54 @@
import torch
from einops import rearrange, repeat
from torch import Tensor
from torch.amp import autocast


def rotate_half(x):
    """Also known as "interleaved" style or GPT-J style."""
    x = rearrange(x, "... (d r) -> ... d r", r=2)
    x1, x2 = x.unbind(dim=-1)
    x = torch.stack((-x2, x1), dim=-1)
    return rearrange(x, "... d r -> ... (d r)")


@autocast("cuda", enabled=False)
def apply_rotary_emb(
    freqs: Tensor, t: Tensor, start_index: int = 0, scale: float = 1.0, seq_dim=-2
):
    dtype = t.dtype

    if t.ndim == 3:
        seq_len = t.shape[seq_dim]
        freqs = freqs[-seq_len:]

    rot_dim = freqs.shape[-1]
    end_index = start_index + rot_dim

    assert (
        rot_dim <= t.shape[-1]
    ), f"feature dimension {t.shape[-1]} is not of sufficient size to rotate in all the positions {rot_dim}"

    t_left, t, t_right = (
        t[..., :start_index],
        t[..., start_index:end_index],
        t[..., end_index:],
    )
    t = (t * freqs.cos() * scale) + (rotate_half(t) * freqs.sin() * scale)
    out = torch.cat((t_left, t, t_right), dim=-1)
    return out.to(dtype)


def precompute_freqs_cis(
    dim: int,
    max_seqlen: int,
    theta: float = 10_000.0,
    theta_rescale_factor: float = 1.0,
    dtype: torch.dtype = torch.float32,
):
    theta *= theta_rescale_factor ** (dim / (dim - 2))
    pos = torch.arange(max_seqlen, dtype=dtype)
    inv_freqs = 1.0 / (theta ** (torch.arange(0, dim, 2, dtype=dtype) / dim))
    freqs = torch.einsum("..., f -> ... f", pos.to(inv_freqs.dtype), inv_freqs)
    freqs = repeat(freqs, "... n -> ... (n r)", r=2)
    return freqs
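A shape-check sketch for the RoPE helpers above (head count, head dimension and sequence lengths are arbitrary illustration values). As in Decoder.forward, the precomputed table is sliced by position before being applied:

import torch
from vui.rope import precompute_freqs_cis, apply_rotary_emb

head_dim, max_seqlen = 64, 256
freqs = precompute_freqs_cis(head_dim, max_seqlen)   # [max_seqlen, head_dim]

q = torch.randn(1, 8, 10, head_dim)                  # [B, heads, T, head_dim]
q_rot = apply_rotary_emb(freqs[:10], q)              # rotate positions 0..9
assert q_rot.shape == q.shape                        # rotation preserves the shape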
src/vui/sampling.py
ADDED
@@ -0,0 +1,43 @@
import torch
from torch import Tensor


def multinomial(input: Tensor, num_samples: int, replacement=False, *, generator=None):
    input_ = input.reshape(-1, input.shape[-1])
    output_ = torch.multinomial(
        input_, num_samples=num_samples, replacement=replacement, generator=generator
    )
    output = output_.reshape(*list(input.shape[:-1]), -1)
    return output


def sample_top_k(probs: Tensor, k: int) -> Tensor:
    top_k_value, _ = torch.topk(probs, k, dim=-1)
    min_value_top_k = top_k_value[..., [-1]]
    probs *= (probs >= min_value_top_k).float()
    probs.div_(probs.sum(dim=-1, keepdim=True))
    next_token = multinomial(probs, num_samples=1)
    return next_token


def sample_top_p(probs: Tensor, p: float) -> Tensor:
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort *= (~mask).float()
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = multinomial(probs_sort, num_samples=1)
    next_token = torch.gather(probs_idx, -1, next_token)

    return next_token


def sample_top_p_top_k(probs: Tensor, p: float, top_k: int):
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    mask = probs_sum - probs_sort > p
    probs_sort *= (~mask).float()
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = sample_top_k(probs_sort, top_k)
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token
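A quick sketch of how these samplers are called (tensor sizes below are arbitrary; note that sample_top_k and sample_top_p modify their probability tensor in place, hence the clones):

import torch
from vui.sampling import sample_top_k, sample_top_p

probs = torch.softmax(torch.randn(2, 4, 1000), dim=-1)  # e.g. [batch, quantizers, codebook]
tok_k = sample_top_k(probs.clone(), k=100)              # [2, 4, 1] sampled indices
tok_p = sample_top_p(probs.clone(), p=0.9)              # [2, 4, 1] sampled indices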
src/vui/tok.py
ADDED
@@ -0,0 +1,19 @@
import torch
from transformers import ByT5Tokenizer


class CustomByT5Tokenizer(ByT5Tokenizer):
    def encode(self, text, add_special_tokens=False, **kwargs):
        """
        Override the encode method.

        Args:
            text (str): Input text
            add_special_tokens (bool): Whether to add BOS/EOS tokens
        """
        # Use the parent class's encode method
        tokens = super().encode(text, add_special_tokens=add_special_tokens, **kwargs)
        return torch.tensor(tokens)


tok = CustomByT5Tokenizer.from_pretrained("google/byt5-small")
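A tiny usage sketch for the module-level tokenizer above (the sample string is arbitrary):

from vui.tok import tok

ids = tok.encode("Hello [pause]")  # 1-D LongTensor of byte-level ids, no BOS/EOS by default
text = tok.decode(ids)             # back to a string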
src/vui/utils.py
ADDED
@@ -0,0 +1,422 @@
1 |
+
import math
|
2 |
+
import time
|
3 |
+
from functools import partial
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import torch.nn.functional as F
|
7 |
+
from torch import Tensor
|
8 |
+
|
9 |
+
|
10 |
+
def load_what_you_can(checkpoint: dict, model: torch.nn.Module):
|
11 |
+
"""
|
12 |
+
This method takes a checkpoint and loads as many weights from it as possible:
|
13 |
+
|
14 |
+
If they are the same shape, there's nothing to do
|
15 |
+
|
16 |
+
Will load the smallest shape otherwise.
|
17 |
+
"""
|
18 |
+
import torch
|
19 |
+
|
20 |
+
model_state_dict = model.state_dict()
|
21 |
+
checkpoint_state_dict = checkpoint
|
22 |
+
|
23 |
+
for name, param in checkpoint_state_dict.items():
|
24 |
+
if name not in model_state_dict:
|
25 |
+
print(f"Ignoring parameter '{name}' because it is not found in the model")
|
26 |
+
continue
|
27 |
+
|
28 |
+
model_state = model_state_dict[name]
|
29 |
+
mshape = model_state.shape
|
30 |
+
pshape = param.shape
|
31 |
+
|
32 |
+
if pshape == mshape:
|
33 |
+
model_state.copy_(param)
|
34 |
+
continue
|
35 |
+
|
36 |
+
if len(pshape) != len(mshape):
|
37 |
+
# Completely different shapes so probably unwise to merge
|
38 |
+
continue
|
39 |
+
|
40 |
+
min_shape = [
|
41 |
+
min(param.shape[i], model_state.shape[i]) for i in range(len(param.shape))
|
42 |
+
]
|
43 |
+
print(name, "model:", mshape, "chkpt:", pshape, "loading:", min_shape)
|
44 |
+
idxs = torch.meshgrid(*[torch.arange(s) for s in min_shape])
|
45 |
+
model_state[tuple(idxs)].copy_(param[tuple(idxs)])
|
46 |
+
|
47 |
+
return model.load_state_dict(model_state_dict)
|
48 |
+
|
49 |
+
|
50 |
+
def multimap(
|
51 |
+
items: list, func: callable, workers=4, desc=None, thread=False, chunk_size=128
|
52 |
+
) -> list:
|
53 |
+
"""
|
54 |
+
Quick and dirty multiprocessing that will return the result of func if it returns None
|
55 |
+
"""
|
56 |
+
from tqdm.contrib.concurrent import process_map, thread_map
|
57 |
+
|
58 |
+
m = thread_map if thread else process_map
|
59 |
+
length = None
|
60 |
+
try:
|
61 |
+
length = len(items)
|
62 |
+
except Exception as e:
|
63 |
+
print(e, "getting length")
|
64 |
+
|
65 |
+
results = m(
|
66 |
+
func,
|
67 |
+
items,
|
68 |
+
leave=False,
|
69 |
+
desc=desc,
|
70 |
+
max_workers=workers,
|
71 |
+
total=length,
|
72 |
+
chunksize=chunk_size,
|
73 |
+
)
|
74 |
+
return list(filter(lambda x: x is not None, results))
|
75 |
+
|
76 |
+
|
77 |
+
def round_up(num: float, factor: int):
|
78 |
+
return factor * math.ceil(num / factor)
|
79 |
+
|
80 |
+
|
81 |
+
def left_padding_mask(lengths, max_len, device=None, dtype=None):
|
82 |
+
masks = []
|
83 |
+
if not max_len:
|
84 |
+
max_len = max(lengths)
|
85 |
+
for l in lengths:
|
86 |
+
mask = torch.empty(l, l, device=device, dtype=dtype).fill_(-torch.inf).triu_(1)
|
87 |
+
diff = max_len - l
|
88 |
+
mask = F.pad(mask, (diff, 0, diff, 0), value=-torch.inf)
|
89 |
+
masks.append(mask)
|
90 |
+
|
91 |
+
masks = torch.stack(masks)
|
92 |
+
return masks[:, None]
|
93 |
+
|
94 |
+
|
95 |
+
def seed_all(seed: int):
|
96 |
+
import random
|
97 |
+
|
98 |
+
import numpy as np
|
99 |
+
import torch
|
100 |
+
|
101 |
+
torch.manual_seed(seed)
|
102 |
+
np.random.seed(seed)
|
103 |
+
random.seed(seed)
|
104 |
+
|
105 |
+
|
106 |
+
def split_bucket_path(url: str) -> tuple[str, str]:
|
107 |
+
url = url.replace("s3://", "")
|
108 |
+
url = url.replace("sj://", "")
|
109 |
+
url = url.replace("r2://", "")
|
110 |
+
bucket = url.split("/")[0]
|
111 |
+
path = "/".join(url.split("/")[1:])
|
112 |
+
return bucket, path
|
113 |
+
|
114 |
+
|
115 |
+
def prob_mask_like(shape, prob: float, device):
|
116 |
+
import torch
|
117 |
+
|
118 |
+
if prob == 1:
|
119 |
+
return torch.ones(shape, device=device, dtype=torch.bool)
|
120 |
+
elif prob == 0:
|
121 |
+
return torch.zeros(shape, device=device, dtype=torch.bool)
|
122 |
+
else:
|
123 |
+
return torch.zeros(shape, device=device).float().uniform_(0, 1) < prob
|
124 |
+
|
125 |
+
|
126 |
+
def round_up_to_multiple(n: int, multiple: int) -> int:
|
127 |
+
if n % multiple != 0:
|
128 |
+
n += multiple - (n % multiple)
|
129 |
+
|
130 |
+
return n
|
131 |
+
|
132 |
+
|
133 |
+
def warmup_then_cosine_decay(
|
134 |
+
step: int, *, warmup_steps: int, steps: int, min_lr: float, max_lr: float
|
135 |
+
):
|
136 |
+
eps = 1e-9
|
137 |
+
cooldown_steps = warmup_steps
|
138 |
+
if step < warmup_steps:
|
139 |
+
return min_lr + step * (max_lr - min_lr) / (warmup_steps)
|
140 |
+
elif step > steps:
|
141 |
+
return min_lr
|
142 |
+
elif step < steps - cooldown_steps:
|
143 |
+
decay_ratio = (step - warmup_steps) / (steps - warmup_steps - cooldown_steps)
|
144 |
+
# assert 0 <= decay_ratio <= 1
|
145 |
+
coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
|
146 |
+
return min_lr + coeff * (max_lr - min_lr)
|
147 |
+
else:
|
148 |
+
# decay from min_lr to 0
|
149 |
+
return min_lr * (steps - step) / cooldown_steps + eps


def decay_to_zero(step: int, *, decay_steps: int, steps: int, max_lr: float):
    if step > steps:
        return 0.0
    else:
        gradient = -max_lr / decay_steps

        return max_lr + gradient * step


def cross_entropy_loss(logits, mask, targets):
    import torch
    import torch.nn.functional as F

    B, Q, T, _ = logits.size()
    assert logits.shape[:-1] == targets.shape
    assert mask.shape == targets.shape
    loss = torch.zeros([], device=targets.device)
    codebook_losses = []
    for q in range(Q):
        logits_q = (
            logits[:, q, ...].contiguous().view(-1, logits.size(-1))
        )  # [B x T, card]
        targets_q = targets[:, q, ...].contiguous().view(-1)  # [B x T]
        mask_q = mask[:, q, ...].contiguous().view(-1)  # [B x T]
        ce_targets = targets_q[mask_q]
        ce_logits = logits_q[mask_q]
        q_ce = F.cross_entropy(ce_logits, ce_targets)
        loss += q_ce
        codebook_losses.append(q_ce.detach())
    # average cross entropy across codebooks
    loss = loss / Q
    return loss, codebook_losses
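

# Illustrative usage sketch (not part of the original file): shapes follow the
# assertions above — logits are (B, Q, T, card), targets and mask are (B, Q, T).
# The sizes below are made up for the example.
def _example_cross_entropy_loss():
    import torch

    B, Q, T, card = 2, 4, 10, 1024
    logits = torch.randn(B, Q, T, card)
    targets = torch.randint(0, card, (B, Q, T))
    mask = torch.ones(B, Q, T, dtype=torch.bool)  # train on every position
    loss, per_codebook = cross_entropy_loss(logits, mask=mask, targets=targets)
    return loss, per_codebook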


def build_optimizer(
    module, *, weight_decay: float, lr: float, betas: tuple[float, float]
):
    import torch

    param_dict = {pn: p for pn, p in module.named_parameters() if p.requires_grad}

    # create optim groups. Any parameter that is 2D will be weight decayed, otherwise not.
    # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't.
    decay_params = [p for n, p in param_dict.items() if p.dim() >= 2]
    nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2]
    optim_groups = [
        {"params": decay_params, "weight_decay": weight_decay},
        {"params": nodecay_params, "weight_decay": 0.0},
    ]
    # num_decay_params = sum(p.numel() for p in decay_params)
    # num_nodecay_params = sum(p.numel() for p in nodecay_params)
    # print(
    #     f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters"
    # )
    # print(
    #     f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters"
    # )
    optimizer = torch.optim.AdamW(optim_groups, lr=lr, betas=betas, fused=True)

    return optimizer
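

# Illustrative usage sketch (not part of the original file): the module and
# hyperparameters are arbitrary. It assumes a CUDA device, since the fused AdamW
# requested above expects CUDA parameters on older torch versions.
def _example_build_optimizer():
    import torch.nn as nn

    model = nn.Sequential(nn.Linear(16, 32), nn.LayerNorm(32), nn.Linear(32, 4)).cuda()
    return build_optimizer(model, weight_decay=0.1, lr=3e-4, betas=(0.9, 0.95))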


def pad_or_cut_right(t: Tensor, padlen: int, value=0) -> Tensor:
    current_len = t.shape[-1]

    if current_len == padlen:
        return t

    if current_len < padlen:
        # Need to pad
        pad_size = (0, padlen - current_len)
        return F.pad(t, pad_size, value=value)
    # Need to cut (along the last dim, matching the padding above)
    return t[..., :padlen]


def pad_or_cut_left(t: Tensor, value: int) -> Tensor:
    dims = t.ndim
    current_len = t.shape[0]

    if current_len == value:
        return t

    if current_len < value:
        # Need to pad
        pad_size = (0,) * (2 * (dims - 1)) + (value - current_len, 0)
        return F.pad(t, pad_size)
    # Need to cut
    return t[-value:]
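

# Illustrative sketch (not part of the original file): pad a short 1-D sequence
# on the right, or truncate a long one, to a fixed length.
def _example_pad_or_cut_right():
    import torch

    codes = torch.tensor([1, 2, 3])
    padded = pad_or_cut_right(codes, 5, value=0)  # tensor([1, 2, 3, 0, 0])
    cut = pad_or_cut_right(codes, 2)  # tensor([1, 2])
    return padded, cut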


def dl_pt(orig: str):
    from os.path import exists

    import torch

    from vui.storage import s3, split_bucket_path

    if not orig.endswith(".pt"):
        orig = orig + ".pt"

    load = partial(torch.load, weights_only=True)
    if exists(orig):
        return load(orig)
    url = "/data/" + orig

    if exists(url):
        return load(url)
    url = "s3://fluxions/" + orig

    bucket, key = split_bucket_path(url)
    response = s3.get_object(Bucket=bucket, Key=key)
    return load(response["Body"])


def dl_ogg(url: str, start=0, end=-1, sr=None):
    import re
    from os.path import exists

    import soundfile as sf
    import torch

    search_sr = re.search(r"(\d+)/", url)
    if search_sr:
        sr = int(search_sr.group(1))

    local_file = exists(url)

    if exists("/data/audio/" + url):
        local_file = True
        url = "/data/audio/" + url

    if not local_file:
        from vui.storage import s3

        url = "s3://fluxions/" + url
        b, p = split_bucket_path(url)
        url = s3.get_object(Bucket=b, Key=p)["Body"]

    if sr is None:
        if local_file:
            sr = sf.info(url).samplerate
        else:
            sr = sf.info(url.read()).samplerate

    start_frame = int(start * sr)
    num_frames = int(end * sr) - start_frame
    wav, _ = sf.read(url, frames=num_frames, start=start_frame, always_2d=True)
    wav = torch.from_numpy(wav).float()
    wav = wav.T.mean(0, keepdim=True)
    return wav, sr


class timer:
    def __init__(self, name=""):
        self.name = name

    def __enter__(self):
        self.t = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        elapsed = time.perf_counter() - self.t
        print(f"{self.name} {elapsed:.4f}")
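

# Illustrative usage sketch (not part of the original file): the label and the
# sleep are arbitrary; any block of work can be timed this way.
def _example_timer():
    with timer("sleep"):
        time.sleep(0.1)  # prints something like "sleep 0.1001"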


@torch.inference_mode()
def decode_audio_from_indices(model, indices, chunk_size=64):
    """
    Decodes audio from indices in batches to avoid memory issues.

    Args:
        model: Codec
        indices: Tensor of shape (1, n_quantizers, sequence_length)
        chunk_size: Number of codec frames (timesteps) to decode at once

    Returns:
        Tensor of reconstructed audio
    """
    device = model.device
    indices = indices.to(device)
    _, _, seq_len = indices.shape
    chunks = seq_len // chunk_size + (1 if seq_len % chunk_size != 0 else 0)

    audio_chunks = []
    for i in range(chunks):
        start_idx = i * chunk_size
        end_idx = min(start_idx + chunk_size, seq_len)
        chunk_indices = indices[:, :, start_idx:end_idx]
        chunk_audio = model.from_indices(chunk_indices)
        audio_chunks.append(chunk_audio.cpu())

    full_audio = torch.cat(audio_chunks, dim=-1)
    return full_audio.flatten()
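

# Illustrative usage sketch (not part of the original file): assumes a codec
# object exposing `.device` and `.from_indices(...)` as used above (e.g. the
# Fluac codec attached to the Vui model) and a (1, n_quantizers, T) code tensor.
def _example_decode_audio_from_indices(codec, codes):
    audio = decode_audio_from_indices(codec, codes, chunk_size=64)
    return audio  # 1-D waveform tensor on CPU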


def normalize_loudness(waveform, sample_rate: int, lufs: float = -12.0):
    """
    Normalize the loudness of an audio tensor using torchaudio.transforms.Loudness.

    Args:
        waveform (torch.Tensor): Input audio tensor of shape (channels, samples)
        sample_rate (int): Sampling rate of the audio
        lufs (float): Target loudness in LUFS (default: -12.0 LUFS)

    Returns:
        torch.Tensor: Loudness-normalized audio tensor
    """
    import torchaudio

    # Ensure the input tensor is 2D (add channel dimension if it's 1D)
    if waveform.ndim == 1:
        waveform = waveform.unsqueeze(0)

    # Create a Loudness transform
    loudness_transform = torchaudio.transforms.Loudness(sample_rate)

    # Measure the current loudness
    current_loudness = loudness_transform(waveform)

    # Calculate the required gain
    gain_db = lufs - current_loudness

    # Convert gain from dB to linear scale
    gain_linear = torch.pow(10, gain_db / 20)

    # Apply the gain to normalize loudness
    normalized_audio = waveform * gain_linear

    return normalized_audio
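

# Illustrative usage sketch (not part of the original file): the file path is a
# placeholder; any waveform loadable by torchaudio works.
def _example_normalize_loudness():
    import torchaudio

    wav, sr = torchaudio.load("example.wav")  # hypothetical path
    return normalize_loudness(wav, sr, lufs=-18.0)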


def get_basename_without_extension(file_path):
    from pathlib import Path

    p = Path(file_path)
    return p.stem


def ollama(prompt, MODEL=None):
    import os

    import requests

    OLLAMA_HOST = "http://localhost:11434"
    API = f"{OLLAMA_HOST}/api/generate"

    if MODEL is None:
        MODEL = os.environ.get("OLLAMA_MODEL", "gemma:1b")

    payload = {
        "model": MODEL,
        "prompt": prompt,
        "stream": False,
        "options": {"temperature": 0.9, "top_p": 0.9, "max_tokens": 1000},
    }

    try:
        response = requests.post(API, json=payload)
        response.raise_for_status()  # Raise exception for HTTP errors
        result = response.json()
        return result.get("response", "")
    except requests.exceptions.RequestException as e:
        print(f"Error calling Ollama API: {e}")
        return ""
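

# Illustrative usage sketch (not part of the original file): assumes a local
# Ollama server on port 11434 with the default (or OLLAMA_MODEL) model pulled;
# the prompt is arbitrary.
def _example_ollama():
    reply = ollama("Write one short sentence about speech synthesis.")
    print(reply)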


def decompile_state_dict(state_dict):
    state_dict = {k.replace("_orig_mod.", ""): v for k, v in state_dict.items()}
    # state_dict = convert_old_weight_norm_to_new(state_dict)
    return {k.replace("module.", ""): v for k, v in state_dict.items()}
src/vui/vad.py
ADDED
@@ -0,0 +1,363 @@
import hashlib
import os
import urllib.request
from collections.abc import Callable

import numpy as np
import pandas as pd
import torch
from pyannote.audio import Model, Pipeline
from pyannote.audio.core.io import AudioFile
from pyannote.audio.pipelines import VoiceActivityDetection
from pyannote.audio.pipelines.utils import PipelineModel
from pyannote.core import Annotation, Segment, SlidingWindowFeature
from tqdm import tqdm

VAD_SEGMENTATION_URL = "https://whisperx.s3.eu-west-2.amazonaws.com/model_weights/segmentation/0b5b3216d60a2d32fc086b47ea8c67589aaeb26b7e07fcbe620d6d0b83e209ea/pytorch_model.bin"


pipeline = None
pipeline_name = "pyannote/voice-activity-detection"


@torch.autocast("cuda", enabled=False)
def detect_voice_activity(waveform, pipe=None):
    """Expects 16 kHz audio."""
    waveform = waveform.flatten().float()[None]
    global pipeline

    if pipe is not None:
        pipeline = pipe
    elif pipeline is None:
        pipeline = Pipeline.from_pretrained(pipeline_name)
        initial_params = {
            "onset": 0.8,
            "offset": 0.5,
            "min_duration_on": 0,
            "min_duration_off": 0.0,
        }
        pipeline.instantiate(initial_params)
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        pipeline = pipeline.to(device)

    vad = pipeline({"waveform": waveform, "sample_rate": 16000})
    segments = [
        (segment.start, segment.end) for segment in vad.get_timeline().support()
    ]

    return segments
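

# Illustrative usage sketch (not part of the original file): the path is a
# placeholder; the audio should already be (or be resampled to) 16 kHz mono.
def _example_detect_voice_activity():
    import torchaudio

    wav, sr = torchaudio.load("speech_16k.wav")  # hypothetical 16 kHz file
    assert sr == 16000
    return detect_voice_activity(wav)  # list of (start_sec, end_sec) tuples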


def load_vad_model(
    device,
    vad_onset=0.500,
    vad_offset=0.363,
    use_auth_token=None,
    model_fp=None,
    batch_size=32,
):
    model_dir = torch.hub._get_torch_home()
    os.makedirs(model_dir, exist_ok=True)
    if model_fp is None:
        model_fp = os.path.join(model_dir, "whisperx-vad-segmentation.bin")
    if os.path.exists(model_fp) and not os.path.isfile(model_fp):
        raise RuntimeError(f"{model_fp} exists and is not a regular file")

    if not os.path.isfile(model_fp):
        with (
            urllib.request.urlopen(VAD_SEGMENTATION_URL) as source,
            open(model_fp, "wb") as output,
        ):
            with tqdm(
                total=int(source.info().get("Content-Length")),
                ncols=80,
                unit="iB",
                unit_scale=True,
                unit_divisor=1024,
            ) as loop:
                while True:
                    buffer = source.read(8192)
                    if not buffer:
                        break

                    output.write(buffer)
                    loop.update(len(buffer))

    model_bytes = open(model_fp, "rb").read()
    if hashlib.sha256(model_bytes).hexdigest() != VAD_SEGMENTATION_URL.split("/")[-2]:
        raise RuntimeError(
            "Model has been downloaded but the SHA256 checksum does not match. Please retry loading the model."
        )

    vad_model = Model.from_pretrained(model_fp, use_auth_token=use_auth_token)
    hyperparameters = {
        "onset": vad_onset,
        "offset": vad_offset,
        "min_duration_on": 0.1,
        "min_duration_off": 0.1,
    }
    vad_pipeline = VoiceActivitySegmentation(
        segmentation=vad_model, device=torch.device(device), batch_size=batch_size
    )
    vad_pipeline.instantiate(hyperparameters)

    return vad_pipeline
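

# Illustrative usage sketch (not part of the original file): downloads the
# segmentation checkpoint into the torch hub cache on first use; use_auth_token
# is forwarded to pyannote if it is needed.
def _example_load_vad_model():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    return load_vad_model(device, vad_onset=0.5, vad_offset=0.363)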


class Binarize:
    """Binarize detection scores using hysteresis thresholding, with a min-cut
    operation to ensure that no active segment is longer than max_duration.

    Parameters
    ----------
    onset : float, optional
        Onset threshold. Defaults to 0.5.
    offset : float, optional
        Offset threshold. Defaults to `onset`.
    min_duration_on : float, optional
        Remove active regions shorter than that many seconds. Defaults to 0s.
    min_duration_off : float, optional
        Fill inactive regions shorter than that many seconds. Defaults to 0s.
    pad_onset : float, optional
        Extend active regions by moving their start time by that many seconds.
        Defaults to 0s.
    pad_offset : float, optional
        Extend active regions by moving their end time by that many seconds.
        Defaults to 0s.
    max_duration : float
        The maximum length of an active segment; divides the segment at the
        timestamp with the lowest score.

    Reference
    ---------
    Gregory Gelly and Jean-Luc Gauvain. "Minimum Word Error Training of
    RNN-based Voice Activity Detection", InterSpeech 2015.

    Modified by Max Bain to include WhisperX's min-cut operation
    https://arxiv.org/abs/2303.00747

    Pyannote-audio
    """

    def __init__(
        self,
        onset: float = 0.5,
        offset: float | None = None,
        min_duration_on: float = 0.0,
        min_duration_off: float = 0.0,
        pad_onset: float = 0.0,
        pad_offset: float = 0.0,
        max_duration: float = float("inf"),
    ):
        super().__init__()

        self.onset = onset
        self.offset = offset or onset

        self.pad_onset = pad_onset
        self.pad_offset = pad_offset

        self.min_duration_on = min_duration_on
        self.min_duration_off = min_duration_off

        self.max_duration = max_duration

    def __call__(self, scores: SlidingWindowFeature) -> Annotation:
        """Binarize detection scores

        Parameters
        ----------
        scores : SlidingWindowFeature
            Detection scores.

        Returns
        -------
        active : Annotation
            Binarized scores.
        """

        num_frames, num_classes = scores.data.shape
        frames = scores.sliding_window
        timestamps = [frames[i].middle for i in range(num_frames)]

        # annotation meant to store 'active' regions
        active = Annotation()
        for k, k_scores in enumerate(scores.data.T):
            label = k if scores.labels is None else scores.labels[k]

            # initial state
            start = timestamps[0]
            is_active = k_scores[0] > self.onset
            curr_scores = [k_scores[0]]
            curr_timestamps = [start]
            t = start
            for t, y in zip(timestamps[1:], k_scores[1:], strict=False):
                # currently active
                if is_active:
                    curr_duration = t - start
                    if curr_duration > self.max_duration:
                        search_after = len(curr_scores) // 2
                        # divide segment
                        min_score_div_idx = search_after + np.argmin(
                            curr_scores[search_after:]
                        )
                        min_score_t = curr_timestamps[min_score_div_idx]
                        region = Segment(
                            start - self.pad_onset, min_score_t + self.pad_offset
                        )
                        active[region, k] = label
                        start = curr_timestamps[min_score_div_idx]
                        curr_scores = curr_scores[min_score_div_idx + 1 :]
                        curr_timestamps = curr_timestamps[min_score_div_idx + 1 :]
                    # switching from active to inactive
                    elif y < self.offset:
                        region = Segment(start - self.pad_onset, t + self.pad_offset)
                        active[region, k] = label
                        start = t
                        is_active = False
                        curr_scores = []
                        curr_timestamps = []
                    curr_scores.append(y)
                    curr_timestamps.append(t)
                # currently inactive
                else:
                    # switching from inactive to active
                    if y > self.onset:
                        start = t
                        is_active = True

            # if active at the end, add final region
            if is_active:
                region = Segment(start - self.pad_onset, t + self.pad_offset)
                active[region, k] = label

        # because of padding, some active regions might be overlapping: merge them.
        # also: fill same speaker gaps shorter than min_duration_off
        if self.pad_offset > 0.0 or self.pad_onset > 0.0 or self.min_duration_off > 0.0:
            if self.max_duration < float("inf"):
                raise NotImplementedError("This would break current max_duration param")
            active = active.support(collar=self.min_duration_off)

        # remove tracks shorter than min_duration_on
        if self.min_duration_on > 0:
            for segment, track in list(active.itertracks()):
                if segment.duration < self.min_duration_on:
                    del active[segment, track]

        return active


class VoiceActivitySegmentation(VoiceActivityDetection):
    def __init__(
        self,
        segmentation: PipelineModel = "pyannote/segmentation",
        fscore: bool = False,
        use_auth_token: str | None = None,
        **inference_kwargs,
    ):
        super().__init__(
            segmentation=segmentation,
            fscore=fscore,
            use_auth_token=use_auth_token,
            **inference_kwargs,
        )

    def apply(self, file: AudioFile, hook: Callable | None = None) -> Annotation:
        """Apply voice activity detection

        Parameters
        ----------
        file : AudioFile
            Processed file.
        hook : callable, optional
            Hook called after each major step of the pipeline with the following
            signature: hook("step_name", step_artefact, file=file)

        Returns
        -------
        speech : Annotation
            Speech regions.
        """

        # setup hook (e.g. for debugging purposes)
        hook = self.setup_hook(file, hook=hook)

        # apply segmentation model (only if needed)
        # output shape is (num_chunks, num_frames, 1)
        if self.training:
            if self.CACHED_SEGMENTATION in file:
                segmentations = file[self.CACHED_SEGMENTATION]
            else:
                segmentations = self._segmentation(file)
                file[self.CACHED_SEGMENTATION] = segmentations
        else:
            segmentations: SlidingWindowFeature = self._segmentation(file)

        return segmentations


def merge_vad(
    vad_arr, pad_onset=0.0, pad_offset=0.0, min_duration_off=0.0, min_duration_on=0.0
):
    active = Annotation()
    for k, vad_t in enumerate(vad_arr):
        region = Segment(vad_t[0] - pad_onset, vad_t[1] + pad_offset)
        active[region, k] = 1

    if pad_offset > 0.0 or pad_onset > 0.0 or min_duration_off > 0.0:
        active = active.support(collar=min_duration_off)

    # remove tracks shorter than min_duration_on
    if min_duration_on > 0:
        for segment, track in list(active.itertracks()):
            if segment.duration < min_duration_on:
                del active[segment, track]

    active = active.for_json()
    active_segs = pd.DataFrame([x["segment"] for x in active["content"]])
    return active_segs


def merge_chunks(
    segments,
    chunk_size,
    onset: float = 0.5,
    offset: float | None = None,
):
    """
    Merge operation described in the WhisperX paper.
    """
    curr_end = 0
    merged_segments = []
    seg_idxs = []

    assert chunk_size > 0
    binarize = Binarize(max_duration=chunk_size, onset=onset, offset=offset)
    segments = binarize(segments)
    segments_list = []
    for speech_turn in segments.get_timeline():
        segments_list.append(Segment(speech_turn.start, speech_turn.end))

    if len(segments_list) == 0:
        print("No active speech found in audio")
        return []
    # assert segments_list, "segments_list is empty."
    # Make sure the starting point is the start of the segment.
    curr_start = segments_list[0].start

    for seg in segments_list:
        if seg.end - curr_start > chunk_size and curr_end - curr_start > 0:
            merged_segments.append(
                {
                    "start": curr_start,
                    "end": curr_end,
                }
            )
            curr_start = seg.start
            seg_idxs = []
        curr_end = seg.end
        seg_idxs.append((seg.start, seg.end))

    merged_segments.append(
        {
            "start": curr_start,
            "end": curr_end,
        }
    )
    return merged_segments
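

# Illustrative end-to-end sketch (not part of the original file): run the VAD
# pipeline on a 16 kHz mono waveform and merge the detected speech regions into
# chunks of at most 30 seconds. The waveform source and chunk size are
# placeholders.
def _example_vad_chunks(wav_16k: torch.Tensor):
    device = "cuda" if torch.cuda.is_available() else "cpu"
    vad_pipeline = load_vad_model(device)
    scores = vad_pipeline({"waveform": wav_16k, "sample_rate": 16000})
    return merge_chunks(scores, chunk_size=30)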