buttercrab committed on
Commit e43723b · unverified · 1 Parent(s): 0f25f79
Files changed (8)
  1. app.py +2 -2
  2. dia/__init__.py +6 -0
  3. dia/audio.py +0 -22
  4. dia/config.py +96 -113
  5. dia/layers.py +411 -127
  6. dia/model.py +559 -168
  7. dia/state.py +82 -69
  8. requirements.txt +3 -2
app.py CHANGED
@@ -16,7 +16,7 @@ from dia.model import Dia
16
  print("Loading Nari model...")
17
  try:
18
  # Use the function from inference.py
19
- model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float32")
20
  except Exception as e:
21
  print(f"Error loading Nari model: {e}")
22
  raise
@@ -375,4 +375,4 @@ if __name__ == "__main__":
375
 
376
  # set `GRADIO_SERVER_NAME`, `GRADIO_SERVER_PORT` env vars to override default values
377
  # use `GRADIO_SERVER_NAME=0.0.0.0` for Docker
378
- demo.launch()
 
16
  print("Loading Nari model...")
17
  try:
18
  # Use the function from inference.py
19
+ model = Dia.from_pretrained("nari-labs/Dia-1.6B-0626", compute_dtype="float16")
20
  except Exception as e:
21
  print(f"Error loading Nari model: {e}")
22
  raise
 
375
 
376
  # set `GRADIO_SERVER_NAME`, `GRADIO_SERVER_PORT` env vars to override default values
377
  # use `GRADIO_SERVER_NAME=0.0.0.0` for Docker
378
+ demo.launch()
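The only functional change in app.py is the checkpoint and precision used at startup: the app now loads nari-labs/Dia-1.6B-0626 in float16 instead of nari-labs/Dia-1.6B in float32. As a minimal sketch, the same loading call can be exercised outside Gradio; the generate() call and its prompt below are illustrative assumptions, not part of this diff (see dia/model.py in this commit for the actual API):

from dia.model import Dia

# Same loading call as the updated app.py: new 0626 checkpoint, half precision.
model = Dia.from_pretrained("nari-labs/Dia-1.6B-0626", compute_dtype="float16")

# Hypothetical usage; the [S1]/[S2] speaker tags match the text handling in dia/model.py.
audio = model.generate("[S1] Dia now loads in float16. [S2] Nice.")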
dia/__init__.py CHANGED
@@ -0,0 +1,6 @@
1
+ from .model import Dia
2
+
3
+
4
+ __all__ = [
5
+ "Dia",
6
+ ]
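dia/__init__.py previously had no exports; this change re-exports the model class at the package root, so the shorter import now works alongside the existing one:

from dia import Dia          # new, enabled by this __init__.py
# from dia.model import Dia  # the pre-existing import path still works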
dia/audio.py CHANGED
@@ -179,25 +179,3 @@ def revert_audio_delay(
179
  ) # Changed np.where to torch.where
180
 
181
  return result_BxTxC
182
-
183
-
184
- @torch.no_grad()
185
- @torch.inference_mode()
186
- def decode(
187
- model,
188
- audio_codes,
189
- ):
190
- """
191
- Decodes the given frames into an output audio waveform
192
- """
193
- if len(audio_codes) != 1:
194
- raise ValueError(f"Expected one frame, got {len(audio_codes)}")
195
-
196
- try:
197
- audio_values = model.quantizer.from_codes(audio_codes)
198
- audio_values = model.decode(audio_values[0])
199
-
200
- return audio_values
201
- except Exception as e:
202
- print(f"Error in decode method: {str(e)}")
203
- raise
 
179
  ) # Changed np.where to torch.where
180
 
181
  return result_BxTxC
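The standalone decode() helper at the end of dia/audio.py is removed by this commit, and dia/model.py correspondingly drops its import of it (see that file's diff below). For reference, the deleted helper condensed to the following, where model is a loaded dac.DAC instance; this is a restatement of the removed code, not new behavior:

import torch

@torch.inference_mode()
def decode(model, audio_codes):
    """Decode one frame of DAC codebook indices into a waveform (as removed above)."""
    if len(audio_codes) != 1:
        raise ValueError(f"Expected one frame, got {len(audio_codes)}")
    audio_values = model.quantizer.from_codes(audio_codes)
    return model.decode(audio_values[0])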
dia/config.py CHANGED
@@ -14,149 +14,132 @@ Key components:
14
  """
15
 
16
  import os
17
- from typing import Annotated
18
 
19
- from pydantic import BaseModel, BeforeValidator, Field
20
-
21
-
22
- class DataConfig(BaseModel, frozen=True):
23
- """Configuration for data loading and preprocessing.
24
-
25
- Attributes:
26
- text_length: Maximum length of text sequences (must be multiple of 128).
27
- audio_length: Maximum length of audio sequences (must be multiple of 128).
28
- channels: Number of audio channels.
29
- text_pad_value: Value used for padding text sequences.
30
- audio_eos_value: Value representing the end of audio sequences.
31
- audio_bos_value: Value representing the beginning of audio sequences.
32
- audio_pad_value: Value used for padding audio sequences.
33
- delay_pattern: List of delay values for each audio channel.
34
- """
35
-
36
- text_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = (
37
- Field(gt=0, multiple_of=128)
38
- )
39
- audio_length: Annotated[int, BeforeValidator(lambda x: (x + 127) // 128 * 128)] = (
40
- Field(gt=0, multiple_of=128)
41
- )
42
- channels: int = Field(default=9, gt=0, multiple_of=1)
43
- text_pad_value: int = Field(default=0)
44
- audio_eos_value: int = Field(default=1024)
45
- audio_pad_value: int = Field(default=1025)
46
- audio_bos_value: int = Field(default=1026)
47
- delay_pattern: list[Annotated[int, Field(ge=0)]] = Field(
48
- default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15]
49
- )
50
-
51
- def __hash__(self) -> int:
52
- """Generate a hash based on all fields of the config."""
53
- return hash(
54
- (
55
- self.text_length,
56
- self.audio_length,
57
- self.channels,
58
- self.text_pad_value,
59
- self.audio_pad_value,
60
- self.audio_bos_value,
61
- self.audio_eos_value,
62
- tuple(self.delay_pattern),
63
- )
64
- )
65
 
66
 
67
  class EncoderConfig(BaseModel, frozen=True):
68
  """Configuration for the encoder component of the Dia model.
69
 
70
  Attributes:
71
- n_layer: Number of transformer layers.
72
- n_embd: Embedding dimension.
73
- n_hidden: Hidden dimension size in the MLP layers.
74
- n_head: Number of attention heads.
75
- head_dim: Dimension per attention head.
 
 
 
 
 
 
 
 
 
76
  """
77
 
78
- n_layer: int = Field(gt=0)
79
- n_embd: int = Field(gt=0)
80
- n_hidden: int = Field(gt=0)
81
- n_head: int = Field(gt=0)
82
- head_dim: int = Field(gt=0)
 
 
 
 
 
 
 
 
 
83
 
84
 
85
  class DecoderConfig(BaseModel, frozen=True):
86
  """Configuration for the decoder component of the Dia model.
87
 
88
  Attributes:
89
- n_layer: Number of transformer layers.
90
- n_embd: Embedding dimension.
91
- n_hidden: Hidden dimension size in the MLP layers.
92
- gqa_query_heads: Number of query heads for grouped-query self-attention.
93
- kv_heads: Number of key/value heads for grouped-query self-attention.
94
- gqa_head_dim: Dimension per query head for grouped-query self-attention.
95
- cross_query_heads: Number of query heads for cross-attention.
96
- cross_head_dim: Dimension per cross-attention head.
 
 
 
 
 
 
 
 
 
 
 
97
  """
98
 
99
- n_layer: int = Field(gt=0)
100
- n_embd: int = Field(gt=0)
101
- n_hidden: int = Field(gt=0)
102
- gqa_query_heads: int = Field(gt=0)
103
- kv_heads: int = Field(gt=0)
104
- gqa_head_dim: int = Field(gt=0)
105
- cross_query_heads: int = Field(gt=0)
106
- cross_head_dim: int = Field(gt=0)
 
 
 
 
 
 
 
 
 
 
 
107
 
108
 
109
- class ModelConfig(BaseModel, frozen=True):
110
  """Main configuration container for the Dia model architecture.
111
 
112
  Attributes:
 
 
113
  encoder: Configuration for the encoder component.
114
  decoder: Configuration for the decoder component.
115
  src_vocab_size: Size of the source (text) vocabulary.
116
  tgt_vocab_size: Size of the target (audio code) vocabulary.
117
- dropout: Dropout probability applied within the model.
118
- normalization_layer_epsilon: Epsilon value for normalization layers (e.g., LayerNorm).
119
- weight_dtype: Data type for model weights (e.g., "float32", "bfloat16").
120
- rope_min_timescale: Minimum timescale for Rotary Positional Embeddings (RoPE).
121
- rope_max_timescale: Maximum timescale for Rotary Positional Embeddings (RoPE).
 
 
 
 
 
 
122
  """
123
 
124
- encoder: EncoderConfig
125
- decoder: DecoderConfig
126
- src_vocab_size: int = Field(default=128, gt=0)
127
- tgt_vocab_size: int = Field(default=1028, gt=0)
128
- dropout: float = Field(default=0.0, ge=0.0, lt=1.0)
129
- normalization_layer_epsilon: float = Field(default=1.0e-5, ge=0.0)
130
- weight_dtype: str = Field(default="float32", description="Weight precision")
131
- rope_min_timescale: int = Field(
132
- default=1, description="Timescale For global Attention"
133
  )
134
- rope_max_timescale: int = Field(
135
- default=10_000, description="Timescale For global Attention"
 
 
136
  )
137
-
138
-
139
- class TrainingConfig(BaseModel, frozen=True):
140
- pass
141
-
142
-
143
- class DiaConfig(BaseModel, frozen=True):
144
- """Master configuration for the Dia model.
145
-
146
- Combines all sub-configurations into a single validated object.
147
-
148
- Attributes:
149
- version: Configuration version string.
150
- model: Model architecture configuration.
151
- training: Training process configuration (precision settings).
152
- data: Data loading and processing configuration.
153
- """
154
-
155
- version: str = Field(default="1.0")
156
- model: ModelConfig
157
- # TODO: remove training. this is just for backwards-compatability
158
- training: TrainingConfig
159
- data: DataConfig
160
 
161
  def save(self, path: str) -> None:
162
  """Save the current configuration instance to a JSON file.
 
14
  """
15
 
16
  import os
 
17
 
18
+ from pydantic import BaseModel, Field
 
19
 
20
 
21
  class EncoderConfig(BaseModel, frozen=True):
22
  """Configuration for the encoder component of the Dia model.
23
 
24
  Attributes:
25
+ model_type: Type of the model, defaults to "dia_encoder".
26
+ hidden_size: Size of the encoder layers, defaults to 1024.
27
+ intermediate_size: Size of the "intermediate" (i.e., feed-forward) layer in the encoder, defaults to 4096.
28
+ num_hidden_layers: Number of hidden layers in the encoder, defaults to 12.
29
+ num_attention_heads: Number of attention heads in the encoder, defaults to 16.
30
+ num_key_value_heads: Number of key-value heads in the encoder, defaults to 16.
31
+ head_dim: Dimension of each attention head, defaults to 128.
32
+ hidden_act: Activation function in the encoder, defaults to "silu".
33
+ max_position_embeddings: Maximum number of position embeddings, defaults to 1024.
34
+ initializer_range: Range for initializing weights, defaults to 0.02.
35
+ norm_eps: Epsilon value for normalization layers, defaults to 1e-5.
36
+ rope_theta: Theta value for RoPE, defaults to 10000.0.
37
+ rope_scaling: Optional scaling factor for RoPE.
38
+ vocab_size: Vocabulary size, defaults to 256.
39
  """
40
 
41
+ head_dim: int = Field(default=128, gt=0)
42
+ hidden_act: str = Field(default="silu")
43
+ hidden_size: int = Field(default=1024, gt=0)
44
+ initializer_range: float = Field(default=0.02)
45
+ intermediate_size: int = Field(default=4096, gt=0)
46
+ max_position_embeddings: int = Field(default=1024, gt=0)
47
+ model_type: str = Field(default="dia_encoder")
48
+ norm_eps: float = Field(default=1e-5)
49
+ num_attention_heads: int = Field(default=16, gt=0)
50
+ num_hidden_layers: int = Field(default=12, gt=0)
51
+ num_key_value_heads: int = Field(default=16, gt=0)
52
+ rope_scaling: float | None = Field(default=None)
53
+ rope_theta: float = Field(default=10000.0)
54
+ vocab_size: int = Field(default=256, gt=0)
55
 
56
 
57
  class DecoderConfig(BaseModel, frozen=True):
58
  """Configuration for the decoder component of the Dia model.
59
 
60
  Attributes:
61
+ model_type: Type of the model, defaults to "dia_decoder".
62
+ hidden_size: Size of the decoder layers, defaults to 2048.
63
+ intermediate_size: Size of the "intermediate" (i.e., feed-forward) layer in the decoder, defaults to 8192.
64
+ num_hidden_layers: Number of hidden layers in the decoder, defaults to 18.
65
+ num_attention_heads: Number of attention heads in the decoder, defaults to 16.
66
+ num_key_value_heads: Number of key-value heads in the decoder, defaults to 4.
67
+ head_dim: Dimension of each attention head, defaults to 128.
68
+ cross_hidden_size: Size of the cross-attention layers, defaults to 1024.
69
+ cross_num_attention_heads: Number of attention heads in the cross-attention mechanism, defaults to 16.
70
+ cross_num_key_value_heads: Number of key-value heads in the cross-attention mechanism, defaults to 16.
71
+ cross_head_dim: Dimension of each cross-attention head, defaults to 128.
72
+ hidden_act: Activation function in the decoder, defaults to "silu".
73
+ max_position_embeddings: Maximum number of position embeddings in the decoder, defaults to 3072.
74
+ initializer_range: Range for initializing weights in the decoder, defaults to 0.02.
75
+ norm_eps: Epsilon value for normalization layers in the decoder, defaults to 1e-5.
76
+ rope_theta: Theta value for RoPE in the decoder, defaults to 10000.0.
77
+ rope_scaling: Optional scaling factor for RoPE in the decoder.
78
+ vocab_size: Vocabulary size for the decoder, defaults to 1028.
79
+ num_channels: Number of channels in the decoder, defaults to 9.
80
  """
81
 
82
+ cross_head_dim: int = Field(default=128, gt=0)
83
+ cross_hidden_size: int = Field(default=1024, gt=0)
84
+ cross_num_attention_heads: int = Field(default=16, gt=0)
85
+ cross_num_key_value_heads: int = Field(default=16, gt=0)
86
+ head_dim: int = Field(default=128, gt=0)
87
+ hidden_act: str = Field(default="silu")
88
+ hidden_size: int = Field(default=2048, gt=0)
89
+ initializer_range: float = Field(default=0.02)
90
+ intermediate_size: int = Field(default=8192, gt=0)
91
+ max_position_embeddings: int = Field(default=3072, gt=0)
92
+ model_type: str = Field(default="dia_decoder")
93
+ norm_eps: float = Field(default=1e-5)
94
+ num_attention_heads: int = Field(default=16, gt=0)
95
+ num_channels: int = Field(default=9, gt=0)
96
+ num_hidden_layers: int = Field(default=18, gt=0)
97
+ num_key_value_heads: int = Field(default=4, gt=0)
98
+ rope_scaling: float | None = Field(default=None)
99
+ rope_theta: float = Field(default=10000.0)
100
+ vocab_size: int = Field(default=1028, gt=0)
101
 
102
 
103
+ class DiaConfig(BaseModel, frozen=True):
104
  """Main configuration container for the Dia model architecture.
105
 
106
  Attributes:
107
+ model_type: Type of the model, defaults to "dia".
108
+ is_encoder_decoder: Flag indicating if the model is an encoder-decoder type, defaults to True.
109
  encoder: Configuration for the encoder component.
110
  decoder: Configuration for the decoder component.
111
  src_vocab_size: Size of the source (text) vocabulary.
112
  tgt_vocab_size: Size of the target (audio code) vocabulary.
113
+ initializer_range: Range for initializing weights, defaults to 0.02.
114
+ norm_eps: Epsilon value for normalization layers, defaults to 1e-5.
115
+ torch_dtype: Data type for model weights in PyTorch, defaults to "float32".
116
+ bos_token_id: Beginning-of-sequence token ID, defaults to 1026.
117
+ eos_token_id: End-of-sequence token ID, defaults to 1024.
118
+ pad_token_id: Padding token ID, defaults to 1025.
119
+ rope_theta: Theta value for RoPE, defaults to 10000.0.
120
+ rope_scaling: Optional scaling factor for RoPE.
121
+ transformers_version: Version of the transformers library, defaults to "4.53.0.dev0".
122
+ architectures: List of model architectures, defaults to ["DiaForConditionalGeneration"].
123
+ delay_pattern: List of delay values for each audio channel, defaults to [0,8,9,10,11,12,13,14,15].
124
  """
125
 
126
+ architectures: list[str] = Field(
127
+ default_factory=lambda: ["DiaForConditionalGeneration"]
 
 
 
 
 
 
 
128
  )
129
+ bos_token_id: int = Field(default=1026)
130
+ decoder_config: DecoderConfig
131
+ delay_pattern: list[int] = Field(
132
+ default_factory=lambda: [0, 8, 9, 10, 11, 12, 13, 14, 15]
133
  )
134
+ encoder_config: EncoderConfig
135
+ eos_token_id: int = Field(default=1024)
136
+ initializer_range: float = Field(default=0.02)
137
+ is_encoder_decoder: bool = Field(default=True)
138
+ model_type: str = Field(default="dia")
139
+ norm_eps: float = Field(default=1e-5)
140
+ pad_token_id: int = Field(default=1025)
141
+ torch_dtype: str = Field(default="float32")
142
+ transformers_version: str = Field(default="4.53.0.dev0")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  def save(self, path: str) -> None:
145
  """Save the current configuration instance to a JSON file.
dia/layers.py CHANGED
@@ -1,10 +1,11 @@
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
 
4
  from torch import Tensor
5
  from torch.nn import RMSNorm
6
 
7
- from .config import DiaConfig
8
  from .state import DecoderInferenceState, EncoderInferenceState, KVCache
9
 
10
 
@@ -15,12 +16,10 @@ def _normalize_axes(axes: tuple[int, ...], ndim: int) -> tuple[int, ...]:
15
  class DenseGeneral(nn.Module):
16
  """
17
  PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.
18
-
19
  Stores weights (`kernel`) in the same layout as Jax and uses torch.tensordot
20
  for the generalized matrix multiplication. Weight/bias shapes are calculated
21
  and parameters created during initialization based on config.
22
  `load_weights` validates shapes and copies data.
23
-
24
  Attributes:
25
  axis (Tuple[int, ...]): Input axis or axes to contract.
26
  in_shapes (Tuple[int, ...]): Sizes of the input dimensions specified by `axis`.
@@ -46,7 +45,6 @@ class DenseGeneral(nn.Module):
46
 
47
  factory_kwargs = {"device": device, "dtype": weight_dtype}
48
  self.weight = nn.Parameter(torch.empty(self.kernel_shape, **factory_kwargs))
49
- self.register_parameter("bias", None)
50
 
51
  def forward(self, inputs: Tensor) -> Tensor:
52
  norm_axis = _normalize_axes(self.axis, inputs.ndim)
@@ -112,53 +110,112 @@ class RotaryEmbedding(nn.Module):
112
  self.embedding_dims = embedding_dims
113
  self.min_timescale = min_timescale
114
  self.max_timescale = max_timescale
115
- self.dtype = dtype
116
 
117
  half_embedding_dim = embedding_dims // 2
118
  fraction = (2.0 * torch.arange(0, half_embedding_dim)) / embedding_dims
119
- self.register_buffer(
120
- "timescale",
121
- self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction,
122
- persistent=False,
123
- )
124
-
125
- def extra_repr(self) -> str:
126
- s = f"{self.timescale.shape}"
127
- return s
128
 
129
  def forward(self, inputs: torch.Tensor, position: torch.Tensor):
130
  """Applies RoPE."""
131
  position = position.unsqueeze(-1).unsqueeze(-1)
132
- timescale = self.timescale.to(inputs.device)
133
- sinusoid_inp = position / timescale
134
- sin = torch.sin(sinusoid_inp).to(inputs.dtype)
135
- cos = torch.cos(sinusoid_inp).to(inputs.dtype)
136
- first_half, second_half = torch.chunk(inputs, 2, dim=-1)
 
 
 
 
 
 
 
 
137
  first_part = first_half * cos - second_half * sin
138
  second_part = second_half * cos + first_half * sin
139
- return torch.cat((first_part, second_part), dim=-1)
 
 
 
140
 
141
 
142
- class Attention(nn.Module):
143
- """Attention using DenseGeneral."""
 
 
144
 
145
  def __init__(
146
  self,
147
- config: DiaConfig,
148
  q_embed_dim: int,
149
  kv_embed_dim: int,
150
  num_query_heads: int,
151
  num_kv_heads: int,
152
  head_dim: int,
153
  compute_dtype: torch.dtype,
154
- is_cross_attn: bool = False,
155
  out_embed_dim: int | None = None,
156
  ):
157
  super().__init__()
158
  self.num_query_heads = num_query_heads
159
  self.num_kv_heads = num_kv_heads
160
  self.head_dim = head_dim
161
- self.is_cross_attn = is_cross_attn
162
  self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
163
  self.projected_query_dim = num_query_heads * head_dim
164
  if num_query_heads % num_kv_heads != 0:
@@ -196,21 +253,18 @@ class Attention(nn.Module):
196
  # --- Rotary Embedding ---
197
  self.rotary_emb = RotaryEmbedding(
198
  embedding_dims=self.head_dim,
199
- min_timescale=config.model.rope_min_timescale,
200
- max_timescale=config.model.rope_max_timescale,
201
  dtype=compute_dtype,
202
  )
203
 
204
  def forward(
205
  self,
206
  Xq: torch.Tensor, # (B, T, D) T = 1 in AR generation
207
- Xkv: torch.Tensor, # (B, S, E) S = 1 in AR generation
208
  q_positions: torch.Tensor, # (B, T)
209
  kv_positions: torch.Tensor | None = None, # (B, S)
210
  attn_mask: torch.Tensor
211
  | None = None, # None in Decoder Self Attention, Valid mask in Others
212
  cache: KVCache | None = None, # None in Encoder, KVCache in Decoder
213
- prefill: bool = False,
214
  is_causal: bool = False,
215
  ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
216
  """
@@ -223,7 +277,6 @@ class Attention(nn.Module):
223
  kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
224
  attn_mask: Attention mask.
225
  cache: KVCache.
226
- prefill: If True, use prefill mode.
227
 
228
  Returns:
229
  A tuple containing:
@@ -235,44 +288,266 @@ class Attention(nn.Module):
235
  original_dtype = Xq.dtype
236
 
237
  Xq_BxTxNxH = self.q_proj(Xq)
238
- Xq_BxTxNxH = self.rotary_emb(Xq_BxTxNxH, position=q_positions)
239
  Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)
240
 
241
  attn_k: torch.Tensor | None = None
242
  attn_v: torch.Tensor | None = None
243
 
244
- if self.is_cross_attn:
245
- attn_k, attn_v = cache.k, cache.v
 
 
 
 
 
 
 
 
 
 
 
 
246
  else:
247
- Xk_BxSxKxH = self.k_proj(Xkv) # (B, S, K, H)
248
- Xv_BxSxKxH = self.v_proj(Xkv) # (B, S, K, H)
249
- Xk_BxSxKxH = self.rotary_emb(
250
- Xk_BxSxKxH, position=kv_positions
251
- ) # (B, S, K, H)
252
-
253
- Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2) # (B, K, S, H)
254
- Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2) # (B, K, S, H)
255
-
256
- if cache is None:
257
- attn_k = Xk_BxKxSxH
258
- attn_v = Xv_BxKxSxH
259
- else:
260
- if prefill:
261
- attn_k, attn_v = Xk_BxKxSxH, Xv_BxKxSxH
262
- cache.prefill(attn_k, attn_v)
263
- else:
264
- attn_k, attn_v = cache.update(Xk_BxKxSxH, Xv_BxKxSxH)
265
-
266
- attn_output = F.scaled_dot_product_attention(
267
- Xq_BxNxTxH,
268
- attn_k,
269
- attn_v,
270
- attn_mask=attn_mask,
271
- scale=1.0,
272
- enable_gqa=self.num_gqa_groups > 1,
273
- is_causal=is_causal,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
274
  )
275
 
 
 
 
 
 
 
 
 
276
  attn_output = attn_output.transpose(1, 2).contiguous() # (B, T, N, H)
277
  output = self.o_proj(attn_output)
278
 
@@ -285,34 +560,33 @@ class EncoderLayer(nn.Module):
285
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
286
  super().__init__()
287
  self.config = config
288
- model_config = config.model
289
- enc_config = config.model.encoder
290
- embed_dim = enc_config.n_embd
291
 
292
  self.pre_sa_norm = RMSNorm(
293
  embed_dim,
294
- eps=model_config.normalization_layer_epsilon,
295
  dtype=torch.float32,
296
  )
297
- self.self_attention = Attention(
298
- config,
299
  q_embed_dim=embed_dim,
300
  kv_embed_dim=embed_dim,
301
- num_query_heads=enc_config.n_head,
302
- num_kv_heads=enc_config.n_head,
303
  head_dim=enc_config.head_dim,
304
  compute_dtype=compute_dtype,
305
- is_cross_attn=False,
306
  out_embed_dim=embed_dim,
307
  )
308
  self.post_sa_norm = RMSNorm(
309
  embed_dim,
310
- eps=model_config.normalization_layer_epsilon,
311
  dtype=torch.float32,
312
  )
313
  self.mlp = MlpBlock(
314
  embed_dim=embed_dim,
315
- intermediate_dim=enc_config.n_hidden,
316
  compute_dtype=compute_dtype,
317
  )
318
 
@@ -322,10 +596,10 @@ class EncoderLayer(nn.Module):
322
  state: EncoderInferenceState,
323
  ) -> torch.Tensor:
324
  residual = x
325
- x_norm = self.pre_sa_norm(x)
 
326
  sa_out = self.self_attention(
327
- Xq=x_norm,
328
- Xkv=x_norm,
329
  q_positions=state.positions,
330
  kv_positions=state.positions,
331
  attn_mask=state.attn_mask,
@@ -333,7 +607,7 @@ class EncoderLayer(nn.Module):
333
  x = residual + sa_out
334
 
335
  residual = x
336
- x_norm = self.post_sa_norm(x)
337
  mlp_out = self.mlp(x_norm)
338
  x = residual + mlp_out
339
 
@@ -346,20 +620,23 @@ class Encoder(nn.Module):
346
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
347
  super().__init__()
348
  self.config = config
349
- model_config = config.model
350
- enc_config = config.model.encoder
351
 
352
  self.embedding = nn.Embedding(
353
- model_config.src_vocab_size,
354
- enc_config.n_embd,
355
  dtype=compute_dtype,
356
  )
357
  self.layers = nn.ModuleList(
358
- [EncoderLayer(config, compute_dtype) for _ in range(enc_config.n_layer)]
 
 
 
359
  )
360
  self.norm = RMSNorm(
361
- enc_config.n_embd,
362
- eps=model_config.normalization_layer_epsilon,
363
  dtype=torch.float32,
364
  )
365
 
@@ -373,7 +650,7 @@ class Encoder(nn.Module):
373
  for layer in self.layers:
374
  x = layer(x, state)
375
 
376
- x = self.norm(x)
377
  return x
378
 
379
 
@@ -383,57 +660,55 @@ class DecoderLayer(nn.Module):
383
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
384
  super().__init__()
385
  self.config = config
386
- model_config = config.model
387
- dec_config = config.model.decoder
388
- enc_config = config.model.encoder
389
- dec_embed_dim = dec_config.n_embd
390
- enc_embed_dim = enc_config.n_embd
391
 
392
  # Norms
393
  self.pre_sa_norm = RMSNorm(
394
  dec_embed_dim,
395
- eps=model_config.normalization_layer_epsilon,
396
  dtype=torch.float32,
397
  )
398
  self.pre_ca_norm = RMSNorm(
399
  dec_embed_dim,
400
- eps=model_config.normalization_layer_epsilon,
401
  dtype=torch.float32,
402
  )
403
  self.pre_mlp_norm = RMSNorm(
404
  dec_embed_dim,
405
- eps=model_config.normalization_layer_epsilon,
406
  dtype=torch.float32,
407
  )
408
 
409
  # Self-Attention (GQA) with Causal Masking
410
- self.self_attention = Attention(
411
- config,
412
  q_embed_dim=dec_embed_dim,
413
  kv_embed_dim=dec_embed_dim,
414
- num_query_heads=dec_config.gqa_query_heads,
415
- num_kv_heads=dec_config.kv_heads,
416
- head_dim=dec_config.gqa_head_dim,
417
  compute_dtype=compute_dtype,
418
- is_cross_attn=False,
419
  out_embed_dim=dec_embed_dim,
420
  )
421
  # Cross-Attention (MHA)
422
- self.cross_attention = Attention(
423
- config=config,
424
  q_embed_dim=dec_embed_dim,
425
  kv_embed_dim=enc_embed_dim, # Note kv_embed_dim
426
- num_query_heads=dec_config.cross_query_heads,
427
- num_kv_heads=dec_config.cross_query_heads,
428
  head_dim=dec_config.cross_head_dim,
429
  compute_dtype=compute_dtype,
430
- is_cross_attn=True,
431
  out_embed_dim=dec_embed_dim,
432
  )
433
  # MLP
434
  self.mlp = MlpBlock(
435
  embed_dim=dec_embed_dim,
436
- intermediate_dim=dec_config.n_hidden,
437
  compute_dtype=compute_dtype,
438
  )
439
 
@@ -444,37 +719,39 @@ class DecoderLayer(nn.Module):
444
  self_attn_cache: KVCache | None = None,
445
  cross_attn_cache: KVCache | None = None,
446
  prefill: bool = False,
 
447
  ) -> torch.Tensor:
448
  residual = x
449
- x_norm = self.pre_sa_norm(x)
 
 
450
 
451
  sa_out = self.self_attention(
452
- Xq=x_norm, # (2, 1, D)
453
- Xkv=x_norm, # (2, 1, D)
454
  q_positions=state.dec_positions, # (2, 1)
455
  kv_positions=state.dec_positions, # (2, 1)
456
- attn_mask=None,
457
  cache=self_attn_cache,
458
  prefill=prefill,
459
  is_causal=prefill,
 
460
  )
461
 
462
  x = residual + sa_out
463
 
464
  residual = x
465
- x_norm = self.pre_ca_norm(x)
466
  ca_out = self.cross_attention(
467
  Xq=x_norm,
468
- Xkv=state.enc_out,
469
  q_positions=state.dec_positions,
470
  kv_positions=state.enc_positions,
471
- attn_mask=state.dec_cross_attn_mask,
472
  cache=cross_attn_cache,
473
  )
474
  x = residual + ca_out
475
 
476
  residual = x
477
- x_norm = self.pre_mlp_norm(x)
478
  mlp_out = self.mlp(x_norm)
479
  x = residual + mlp_out
480
 
@@ -487,16 +764,14 @@ class Decoder(nn.Module):
487
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
488
  super().__init__()
489
  self.config = config
490
- model_config = config.model
491
- dec_config = config.model.decoder
492
- data_config = config.data
493
- self.num_channels = data_config.channels
494
- self.num_layers = dec_config.n_layer
495
 
496
  self.embeddings = nn.ModuleList(
497
  [
498
  nn.Embedding(
499
- model_config.tgt_vocab_size, dec_config.n_embd, dtype=compute_dtype
500
  )
501
  for _ in range(self.num_channels)
502
  ]
@@ -509,14 +784,14 @@ class Decoder(nn.Module):
509
  )
510
 
511
  self.norm = RMSNorm(
512
- dec_config.n_embd,
513
- eps=model_config.normalization_layer_epsilon,
514
  dtype=torch.float32,
515
  )
516
 
517
  self.logits_dense = DenseGeneral(
518
- in_shapes=(dec_config.n_embd,),
519
- out_features=(self.num_channels, model_config.tgt_vocab_size),
520
  axis=(-1,),
521
  weight_dtype=compute_dtype,
522
  )
@@ -524,7 +799,6 @@ class Decoder(nn.Module):
524
  def precompute_cross_attn_cache(
525
  self,
526
  enc_out: torch.Tensor, # (B, S, E)
527
- enc_positions: torch.Tensor, # (B, S)
528
  ) -> list[KVCache]:
529
  """
530
  Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
@@ -536,7 +810,6 @@ class Decoder(nn.Module):
536
  k_proj = cross_attn_module.k_proj(enc_out)
537
  v_proj = cross_attn_module.v_proj(enc_out)
538
 
539
- k_proj = cross_attn_module.rotary_emb(k_proj, position=enc_positions)
540
  k = k_proj.transpose(1, 2)
541
  v = v_proj.transpose(1, 2)
542
 
@@ -548,10 +821,10 @@ class Decoder(nn.Module):
548
  self,
549
  tgt_ids_Bx1xC: torch.Tensor, # [B, 1, C]
550
  state: DecoderInferenceState,
 
551
  ) -> torch.Tensor:
552
  """
553
  Performs a single decoding step, managing KV caches layer by layer.
554
-
555
  Returns:
556
  A tuple containing:
557
  - logits_Bx1xCV: The final output logits for the current step (B, 1, C*V), cast to float32.
@@ -571,6 +844,7 @@ class Decoder(nn.Module):
571
  state,
572
  self_attn_cache=self_cache,
573
  cross_attn_cache=cross_cache,
 
574
  )
575
 
576
  x = self.norm(x)
@@ -583,7 +857,6 @@ class Decoder(nn.Module):
583
  ) -> torch.Tensor:
584
  """
585
  Forward pass for the Decoder stack, managing KV caches.
586
-
587
  Args:
588
  tgt_ids_BxTxC: Target token IDs (B, T, C).
589
  encoder_out: Output from the encoder (B, S, E).
@@ -597,7 +870,6 @@ class Decoder(nn.Module):
597
  precomputed_cross_attn_kv: A single tuple containing the pre-computed K/V cache
598
  derived from `encoder_out`. This is passed identically
599
  to all layers.
600
-
601
  Returns:
602
  A tuple containing:
603
  - logits: The final output logits (B, T, C * V), cast to float32.
@@ -632,7 +904,19 @@ class Decoder(nn.Module):
632
  return logits_BxTxCxV.to(torch.float32)
633
 
634
 
635
- class DiaModel(nn.Module):
 
 
 
 
 
 
 
 
 
 
 
 
636
  """PyTorch Dia Model using DenseGeneral."""
637
 
638
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
 
1
  import torch
2
  import torch.nn as nn
3
  import torch.nn.functional as F
4
+ from huggingface_hub import PyTorchModelHubMixin
5
  from torch import Tensor
6
  from torch.nn import RMSNorm
7
 
8
+ from .config import DecoderConfig, DiaConfig, EncoderConfig
9
  from .state import DecoderInferenceState, EncoderInferenceState, KVCache
10
 
11
 
 
16
  class DenseGeneral(nn.Module):
17
  """
18
  PyTorch equivalent of flax.linen.DenseGeneral with shapes defined at init.
 
19
  Stores weights (`kernel`) in the same layout as Jax and uses torch.tensordot
20
  for the generalized matrix multiplication. Weight/bias shapes are calculated
21
  and parameters created during initialization based on config.
22
  `load_weights` validates shapes and copies data.
 
23
  Attributes:
24
  axis (Tuple[int, ...]): Input axis or axes to contract.
25
  in_shapes (Tuple[int, ...]): Sizes of the input dimensions specified by `axis`.
 
45
 
46
  factory_kwargs = {"device": device, "dtype": weight_dtype}
47
  self.weight = nn.Parameter(torch.empty(self.kernel_shape, **factory_kwargs))
 
48
 
49
  def forward(self, inputs: Tensor) -> Tensor:
50
  norm_axis = _normalize_axes(self.axis, inputs.ndim)
 
110
  self.embedding_dims = embedding_dims
111
  self.min_timescale = min_timescale
112
  self.max_timescale = max_timescale
113
+ self.compute_dtype = dtype
114
 
115
  half_embedding_dim = embedding_dims // 2
116
  fraction = (2.0 * torch.arange(0, half_embedding_dim)) / embedding_dims
117
+ timescale = (
118
+ self.min_timescale * (self.max_timescale / self.min_timescale) ** fraction
119
+ ).to(torch.float32)
120
+ self.register_buffer("timescale", timescale, persistent=False)
 
 
 
 
 
121
 
122
  def forward(self, inputs: torch.Tensor, position: torch.Tensor):
123
  """Applies RoPE."""
124
  position = position.unsqueeze(-1).unsqueeze(-1)
125
+ sinusoid_inp = position / self.timescale
126
+ sin = torch.sin(sinusoid_inp)
127
+ cos = torch.cos(sinusoid_inp)
128
+ first_half, second_half = torch.chunk(inputs.to(torch.float32), 2, dim=-1)
129
+ first_part = first_half * cos - second_half * sin
130
+ second_part = second_half * cos + first_half * sin
131
+ return torch.cat(
132
+ (first_part.to(self.compute_dtype), second_part.to(self.compute_dtype)),
133
+ dim=-1,
134
+ )
135
+
136
+ def apply_rope(self, inputs: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor):
137
+ first_half, second_half = torch.chunk(inputs.to(torch.float32), 2, dim=-1)
138
  first_part = first_half * cos - second_half * sin
139
  second_part = second_half * cos + first_half * sin
140
+ return torch.cat(
141
+ (first_part.to(self.compute_dtype), second_part.to(self.compute_dtype)),
142
+ dim=-1,
143
+ )
144
 
145
 
146
+ def custom_scaled_dot_product_attention(
147
+ query: torch.Tensor,
148
+ key: torch.Tensor,
149
+ value: torch.Tensor,
150
+ attn_mask: torch.Tensor | None = None,
151
+ scale: float = 1.0,
152
+ is_causal: bool = False,
153
+ num_gqa_groups: int = 1,
154
+ ) -> torch.Tensor:
155
+ """
156
+ Custom scaled dot-product attention with GQA support for MPS compatibility.
157
+
158
+ Args:
159
+ query: (B, N_q, T, H) - Query tensor, N_q = num_query_heads
160
+ key: (B, N_kv, S, H) - Key tensor, N_kv = num_kv_heads
161
+ value: (B, N_kv, S, H) - Value tensor
162
+ attn_mask: (B, 1, T, S) - Attention mask, optional
163
+ scale: Scaling factor for attention scores
164
+ is_causal: If True, apply causal masking
165
+ num_gqa_groups: Number of query groups per KV head (N_q / N_kv)
166
+
167
+ Returns:
168
+ output: (B, N_q, T, H) - Attention output
169
+ """
170
+ B, N_q, T, H = query.shape
171
+ _, N_kv, S, _ = key.shape
172
+
173
+ # For GQA, repeat key and value tensors to match query heads
174
+ if num_gqa_groups > 1:
175
+ key = key.repeat_interleave(num_gqa_groups, dim=1) # (B, N_q, S, H)
176
+ value = value.repeat_interleave(num_gqa_groups, dim=1) # (B, N_q, S, H)
177
+
178
+ # Compute attention scores: (B, N_q, T, H) @ (B, N_q, H, S) -> (B, N_q, T, S)
179
+ scores = torch.matmul(query, key.transpose(-1, -2)) * scale
180
+
181
+ # Apply causal mask if needed
182
+ if is_causal:
183
+ causal_mask = torch.tril(
184
+ torch.ones(T, S, dtype=torch.bool, device=query.device)
185
+ )
186
+ scores = scores.masked_fill(~causal_mask, float("-inf"))
187
+
188
+ # Apply attention mask if provided
189
+ if attn_mask is not None:
190
+ scores = scores.masked_fill(~attn_mask, float("-inf"))
191
+
192
+ # Softmax over the last dimension (S)
193
+ attn_weights = F.softmax(scores, dim=-1)
194
+
195
+ # Compute output: (B, N_q, T, S) @ (B, N_q, S, H) -> (B, N_q, T, H)
196
+ output = torch.matmul(attn_weights, value)
197
+
198
+ return output
199
+
200
+
201
+ class CrossAttention(nn.Module):
202
+ """Cross-Attention using DenseGeneral."""
203
 
204
  def __init__(
205
  self,
206
+ config: EncoderConfig | DecoderConfig,
207
  q_embed_dim: int,
208
  kv_embed_dim: int,
209
  num_query_heads: int,
210
  num_kv_heads: int,
211
  head_dim: int,
212
  compute_dtype: torch.dtype,
 
213
  out_embed_dim: int | None = None,
214
  ):
215
  super().__init__()
216
  self.num_query_heads = num_query_heads
217
  self.num_kv_heads = num_kv_heads
218
  self.head_dim = head_dim
 
219
  self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
220
  self.projected_query_dim = num_query_heads * head_dim
221
  if num_query_heads % num_kv_heads != 0:
 
253
  # --- Rotary Embedding ---
254
  self.rotary_emb = RotaryEmbedding(
255
  embedding_dims=self.head_dim,
256
+ max_timescale=config.rope_theta,
 
257
  dtype=compute_dtype,
258
  )
259
 
260
  def forward(
261
  self,
262
  Xq: torch.Tensor, # (B, T, D) T = 1 in AR generation
 
263
  q_positions: torch.Tensor, # (B, T)
264
  kv_positions: torch.Tensor | None = None, # (B, S)
265
  attn_mask: torch.Tensor
266
  | None = None, # None in Decoder Self Attention, Valid mask in Others
267
  cache: KVCache | None = None, # None in Encoder, KVCache in Decoder
 
268
  is_causal: bool = False,
269
  ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
270
  """
 
277
  kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
278
  attn_mask: Attention mask.
279
  cache: KVCache.
 
280
 
281
  Returns:
282
  A tuple containing:
 
288
  original_dtype = Xq.dtype
289
 
290
  Xq_BxTxNxH = self.q_proj(Xq)
 
291
  Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)
292
 
293
  attn_k: torch.Tensor | None = None
294
  attn_v: torch.Tensor | None = None
295
 
296
+ attn_k, attn_v = cache.k, cache.v
297
+
298
+ # Use custom attention for MPS backend, otherwise use optimized PyTorch function
299
+ is_mps = Xq.device.type == "mps" and torch.backends.mps.is_available()
300
+ if is_mps:
301
+ attn_output = custom_scaled_dot_product_attention(
302
+ query=Xq_BxNxTxH,
303
+ key=attn_k,
304
+ value=attn_v,
305
+ attn_mask=attn_mask if not is_causal else None,
306
+ scale=1.0,
307
+ is_causal=is_causal,
308
+ num_gqa_groups=self.num_gqa_groups,
309
+ )
310
  else:
311
+ attn_output = F.scaled_dot_product_attention(
312
+ Xq_BxNxTxH,
313
+ attn_k,
314
+ attn_v,
315
+ attn_mask=attn_mask if not is_causal else None,
316
+ scale=1.0,
317
+ enable_gqa=self.num_gqa_groups > 1,
318
+ is_causal=is_causal,
319
+ )
320
+
321
+ attn_output = attn_output.transpose(1, 2).contiguous() # (B, T, N, H)
322
+ output = self.o_proj(attn_output)
323
+
324
+ return output.to(original_dtype)
325
+
326
+
327
+ class FusedQKV(nn.Module):
328
+ def __init__(
329
+ self,
330
+ in_features: int,
331
+ out_features: int,
332
+ bias: bool = False,
333
+ num_q_heads: int = 1,
334
+ q_head_dim: int = 1,
335
+ num_kv_heads: int = 1,
336
+ kv_head_dim: int = 1,
337
+ ):
338
+ super().__init__()
339
+ self.num_q_heads = num_q_heads
340
+ self.q_head_dim = q_head_dim
341
+ self.num_kv_heads = num_kv_heads
342
+ self.kv_head_dim = kv_head_dim
343
+ self.q_output_dim = num_q_heads * q_head_dim
344
+ self.kv_output_dim = num_kv_heads * kv_head_dim
345
+ self.linear = nn.Linear(in_features, out_features, bias=bias)
346
+
347
+ def forward(
348
+ self, inputs: torch.Tensor
349
+ ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
350
+ x = self.linear(inputs)
351
+
352
+ q, k, v = x.split(
353
+ [self.q_output_dim, self.kv_output_dim, self.kv_output_dim], dim=-1
354
  )
355
 
356
+ q = q.reshape(q.shape[:-1] + (self.num_q_heads, self.q_head_dim))
357
+ k = k.reshape(k.shape[:-1] + (self.num_kv_heads, self.kv_head_dim))
358
+ v = v.reshape(v.shape[:-1] + (self.num_kv_heads, self.kv_head_dim))
359
+
360
+ return q, k, v
361
+
362
+
363
+ class SelfAttention(nn.Module):
364
+ """Attention using DenseGeneral."""
365
+
366
+ def __init__(
367
+ self,
368
+ config: DiaConfig,
369
+ q_embed_dim: int,
370
+ kv_embed_dim: int,
371
+ num_query_heads: int,
372
+ num_kv_heads: int,
373
+ head_dim: int,
374
+ compute_dtype: torch.dtype,
375
+ out_embed_dim: int | None = None,
376
+ ):
377
+ super().__init__()
378
+ self.num_query_heads = num_query_heads
379
+ self.num_kv_heads = num_kv_heads
380
+ self.head_dim = head_dim
381
+ self.output_dim = out_embed_dim if out_embed_dim is not None else q_embed_dim
382
+ self.projected_query_dim = num_query_heads * head_dim
383
+ if num_query_heads % num_kv_heads != 0:
384
+ raise ValueError(
385
+ f"num_query_heads ({num_query_heads}) must be divisible by num_kv_heads ({num_kv_heads})"
386
+ )
387
+ self.num_gqa_groups = num_query_heads // num_kv_heads
388
+ self.kv_embed_dim = kv_embed_dim
389
+ self.q_embed_dim = q_embed_dim
390
+
391
+ # --- Projection Layers using DenseGeneral ---
392
+ self.q_proj = DenseGeneral(
393
+ in_shapes=(q_embed_dim,),
394
+ out_features=(num_query_heads, head_dim),
395
+ axis=(-1,),
396
+ weight_dtype=compute_dtype,
397
+ )
398
+ self.k_proj = DenseGeneral(
399
+ in_shapes=(kv_embed_dim,),
400
+ out_features=(num_kv_heads, head_dim),
401
+ axis=(-1,),
402
+ weight_dtype=compute_dtype,
403
+ )
404
+ self.v_proj = DenseGeneral(
405
+ in_shapes=(kv_embed_dim,),
406
+ out_features=(num_kv_heads, head_dim),
407
+ axis=(-1,),
408
+ weight_dtype=compute_dtype,
409
+ )
410
+ self.o_proj = DenseGeneral(
411
+ in_shapes=(num_query_heads, head_dim),
412
+ out_features=(self.output_dim,),
413
+ axis=(-2, -1),
414
+ weight_dtype=compute_dtype,
415
+ )
416
+
417
+ # --- Rotary Embedding ---
418
+ self.rotary_emb = RotaryEmbedding(
419
+ embedding_dims=self.head_dim,
420
+ max_timescale=config.rope_theta,
421
+ dtype=compute_dtype,
422
+ )
423
+
424
+ self.is_fused_qkv = False
425
+
426
+ def get_linear_weight(self, dense: DenseGeneral):
427
+ W_dg = dense.weight.data
428
+
429
+ out_features = 1
430
+ input_features = 1
431
+ for dim in dense.out_features:
432
+ out_features *= dim
433
+ for dim in dense.in_shapes:
434
+ input_features *= dim
435
+
436
+ W_dg_reshaped_for_linear_T = W_dg.reshape(input_features, out_features)
437
+ linear_weight = W_dg_reshaped_for_linear_T.transpose(0, 1).contiguous()
438
+ return linear_weight
439
+
440
+ def patch_fused_qkv(self):
441
+ q_proj_weight = self.get_linear_weight(self.q_proj)
442
+ k_proj_weight = self.get_linear_weight(self.k_proj)
443
+ v_proj_weight = self.get_linear_weight(self.v_proj)
444
+
445
+ self.qkv = FusedQKV(
446
+ self.kv_embed_dim,
447
+ (
448
+ self.num_query_heads * self.head_dim
449
+ + 2 * (self.num_kv_heads * self.head_dim)
450
+ ),
451
+ bias=False,
452
+ num_q_heads=self.num_query_heads,
453
+ q_head_dim=self.head_dim,
454
+ num_kv_heads=self.num_kv_heads,
455
+ kv_head_dim=self.head_dim,
456
+ )
457
+ self.qkv.linear.weight.data = torch.cat(
458
+ [q_proj_weight, k_proj_weight, v_proj_weight], dim=0
459
+ )
460
+
461
+ # print(f"qkv.weight.shape: {self.qkv.linear.weight.shape}")
462
+ self.is_fused_qkv = True
463
+
464
+ def forward(
465
+ self,
466
+ X: torch.Tensor, # (B, T, D) T = 1 in AR generation
467
+ q_positions: torch.Tensor, # (B, T)
468
+ kv_positions: torch.Tensor | None = None, # (B, S)
469
+ attn_mask: torch.Tensor
470
+ | None = None, # None in Decoder Self Attention, Valid mask in Others
471
+ cache: KVCache | None = None, # None in Encoder, KVCache in Decoder
472
+ prefill: bool = False,
473
+ is_causal: bool = False,
474
+ current_idx: torch.Tensor | None = None,
475
+ ) -> tuple[torch.Tensor, tuple[torch.Tensor, torch.Tensor] | None]:
476
+ """
477
+ Performs attention calculation with optional KV caching.
478
+ Args:
479
+ Xq: Query tensor (B, T, D). T=1 during single-step decoding.
480
+ Xkv: Key/Value source tensor (B, S, E). S=1 during single-step decoding for self-attn.
481
+ q_positions: Positions for queries (B, T).
482
+ kv_positions: Positions for keys/values (B, S). If None, uses q_positions.
483
+ attn_mask: Attention mask.
484
+ cache: KVCache.
485
+ prefill: If True, use prefill mode.
486
+ Returns:
487
+ A tuple containing:
488
+ - output: The attention output tensor (B, T, output_dim).
489
+ - present_kv: The K/V state to be cached for the next step ((B, N, S_new, H), (B, N, S_new, H)). For self-attn, S_new = S_past + S. For cross-attn, S_new = S_kv.
490
+ """
491
+ if kv_positions is None:
492
+ kv_positions = q_positions
493
+
494
+ original_dtype = X.dtype
495
+
496
+ if self.is_fused_qkv:
497
+ Xq_BxTxNxH, Xk_BxSxKxH, Xv_BxSxKxH = self.qkv(X)
498
+ else:
499
+ Xq_BxTxNxH = self.q_proj(X)
500
+ Xk_BxSxKxH = self.k_proj(X)
501
+ Xv_BxSxKxH = self.v_proj(X)
502
+
503
+ position = q_positions.unsqueeze(-1).unsqueeze(-1)
504
+ sinusoid_inp = position / self.rotary_emb.timescale
505
+ sin = torch.sin(sinusoid_inp)
506
+ cos = torch.cos(sinusoid_inp)
507
+
508
+ Xq_BxTxNxH = self.rotary_emb.apply_rope(Xq_BxTxNxH, sin, cos)
509
+ Xk_BxSxKxH = self.rotary_emb.apply_rope(Xk_BxSxKxH, sin, cos)
510
+
511
+ Xq_BxNxTxH = Xq_BxTxNxH.transpose(1, 2)
512
+
513
+ attn_k: torch.Tensor | None = None
514
+ attn_v: torch.Tensor | None = None
515
+
516
+ Xk_BxKxSxH = Xk_BxSxKxH.transpose(1, 2) # (B, K, S, H)
517
+ Xv_BxKxSxH = Xv_BxSxKxH.transpose(1, 2) # (B, K, S, H)
518
+
519
+ if cache is None:
520
+ attn_k = Xk_BxKxSxH
521
+ attn_v = Xv_BxKxSxH
522
+ elif prefill:
523
+ attn_k, attn_v = Xk_BxKxSxH, Xv_BxKxSxH
524
+ cache.prefill(attn_k, attn_v)
525
+ else:
526
+ attn_k, attn_v = cache.update(Xk_BxKxSxH, Xv_BxKxSxH, current_idx)
527
+
528
+ # Use custom attention for MPS backend, otherwise use optimized PyTorch function
529
+ is_mps = Xv_BxSxKxH.device.type == "mps" and torch.backends.mps.is_available()
530
+ if is_mps:
531
+ attn_output = custom_scaled_dot_product_attention(
532
+ query=Xq_BxNxTxH,
533
+ key=attn_k,
534
+ value=attn_v,
535
+ attn_mask=attn_mask if not is_causal else None,
536
+ scale=1.0,
537
+ is_causal=is_causal,
538
+ num_gqa_groups=self.num_gqa_groups,
539
+ )
540
+ else:
541
+ attn_output = F.scaled_dot_product_attention(
542
+ Xq_BxNxTxH,
543
+ attn_k,
544
+ attn_v,
545
+ attn_mask=attn_mask if not is_causal else None,
546
+ scale=1.0,
547
+ enable_gqa=self.num_gqa_groups > 1,
548
+ is_causal=is_causal,
549
+ )
550
+
551
  attn_output = attn_output.transpose(1, 2).contiguous() # (B, T, N, H)
552
  output = self.o_proj(attn_output)
553
 
 
560
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
561
  super().__init__()
562
  self.config = config
563
+ enc_config = config.encoder_config
564
+ embed_dim = enc_config.hidden_size
565
+ self.compute_dtype = compute_dtype
566
 
567
  self.pre_sa_norm = RMSNorm(
568
  embed_dim,
569
+ eps=enc_config.norm_eps,
570
  dtype=torch.float32,
571
  )
572
+ self.self_attention = SelfAttention(
573
+ enc_config,
574
  q_embed_dim=embed_dim,
575
  kv_embed_dim=embed_dim,
576
+ num_query_heads=enc_config.num_attention_heads,
577
+ num_kv_heads=enc_config.num_key_value_heads,
578
  head_dim=enc_config.head_dim,
579
  compute_dtype=compute_dtype,
 
580
  out_embed_dim=embed_dim,
581
  )
582
  self.post_sa_norm = RMSNorm(
583
  embed_dim,
584
+ eps=enc_config.norm_eps,
585
  dtype=torch.float32,
586
  )
587
  self.mlp = MlpBlock(
588
  embed_dim=embed_dim,
589
+ intermediate_dim=enc_config.intermediate_size,
590
  compute_dtype=compute_dtype,
591
  )
592
 
 
596
  state: EncoderInferenceState,
597
  ) -> torch.Tensor:
598
  residual = x
599
+ x_norm = self.pre_sa_norm(x).to(self.compute_dtype)
600
+
601
  sa_out = self.self_attention(
602
+ X=x_norm,
 
603
  q_positions=state.positions,
604
  kv_positions=state.positions,
605
  attn_mask=state.attn_mask,
 
607
  x = residual + sa_out
608
 
609
  residual = x
610
+ x_norm = self.post_sa_norm(x).to(self.compute_dtype)
611
  mlp_out = self.mlp(x_norm)
612
  x = residual + mlp_out
613
 
 
620
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
621
  super().__init__()
622
  self.config = config
623
+ enc_config = config.encoder_config
624
+ self.compute_dtype = compute_dtype
625
 
626
  self.embedding = nn.Embedding(
627
+ enc_config.vocab_size,
628
+ enc_config.hidden_size,
629
  dtype=compute_dtype,
630
  )
631
  self.layers = nn.ModuleList(
632
+ [
633
+ EncoderLayer(config, compute_dtype)
634
+ for _ in range(enc_config.num_hidden_layers)
635
+ ]
636
  )
637
  self.norm = RMSNorm(
638
+ enc_config.hidden_size,
639
+ eps=enc_config.norm_eps,
640
  dtype=torch.float32,
641
  )
642
 
 
650
  for layer in self.layers:
651
  x = layer(x, state)
652
 
653
+ x = self.norm(x).to(self.compute_dtype)
654
  return x
655
 
656
 
 
660
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
661
  super().__init__()
662
  self.config = config
663
+ dec_config = config.decoder_config
664
+ enc_config = config.encoder_config
665
+ dec_embed_dim = dec_config.hidden_size
666
+ enc_embed_dim = enc_config.hidden_size
667
+ self.compute_dtype = compute_dtype
668
 
669
  # Norms
670
  self.pre_sa_norm = RMSNorm(
671
  dec_embed_dim,
672
+ eps=dec_config.norm_eps,
673
  dtype=torch.float32,
674
  )
675
  self.pre_ca_norm = RMSNorm(
676
  dec_embed_dim,
677
+ eps=dec_config.norm_eps,
678
  dtype=torch.float32,
679
  )
680
  self.pre_mlp_norm = RMSNorm(
681
  dec_embed_dim,
682
+ eps=dec_config.norm_eps,
683
  dtype=torch.float32,
684
  )
685
 
686
  # Self-Attention (GQA) with Causal Masking
687
+ self.self_attention = SelfAttention(
688
+ dec_config,
689
  q_embed_dim=dec_embed_dim,
690
  kv_embed_dim=dec_embed_dim,
691
+ num_query_heads=dec_config.num_attention_heads,
692
+ num_kv_heads=dec_config.num_key_value_heads,
693
+ head_dim=dec_config.head_dim,
694
  compute_dtype=compute_dtype,
 
695
  out_embed_dim=dec_embed_dim,
696
  )
697
  # Cross-Attention (MHA)
698
+ self.cross_attention = CrossAttention(
699
+ dec_config,
700
  q_embed_dim=dec_embed_dim,
701
  kv_embed_dim=enc_embed_dim, # Note kv_embed_dim
702
+ num_query_heads=dec_config.cross_num_attention_heads,
703
+ num_kv_heads=dec_config.cross_num_key_value_heads,
704
  head_dim=dec_config.cross_head_dim,
705
  compute_dtype=compute_dtype,
 
706
  out_embed_dim=dec_embed_dim,
707
  )
708
  # MLP
709
  self.mlp = MlpBlock(
710
  embed_dim=dec_embed_dim,
711
+ intermediate_dim=dec_config.intermediate_size,
712
  compute_dtype=compute_dtype,
713
  )
714
 
 
719
  self_attn_cache: KVCache | None = None,
720
  cross_attn_cache: KVCache | None = None,
721
  prefill: bool = False,
722
+ current_idx: int = 0,
723
  ) -> torch.Tensor:
724
  residual = x
725
+ x_norm = self.pre_sa_norm(x).to(self.compute_dtype)
726
+
727
+ self_attn_mask = state.casual_attn_mask[None, None, current_idx]
728
 
729
  sa_out = self.self_attention(
730
+ X=x_norm, # (2, 1, D)
 
731
  q_positions=state.dec_positions, # (2, 1)
732
  kv_positions=state.dec_positions, # (2, 1)
733
+ attn_mask=self_attn_mask,
734
  cache=self_attn_cache,
735
  prefill=prefill,
736
  is_causal=prefill,
737
+ current_idx=current_idx,
738
  )
739
 
740
  x = residual + sa_out
741
 
742
  residual = x
743
+ x_norm = self.pre_ca_norm(x).to(self.compute_dtype)
744
  ca_out = self.cross_attention(
745
  Xq=x_norm,
 
746
  q_positions=state.dec_positions,
747
  kv_positions=state.enc_positions,
748
+ attn_mask=state.cross_attn_mask,
749
  cache=cross_attn_cache,
750
  )
751
  x = residual + ca_out
752
 
753
  residual = x
754
+ x_norm = self.pre_mlp_norm(x).to(self.compute_dtype)
755
  mlp_out = self.mlp(x_norm)
756
  x = residual + mlp_out
757
 
 
764
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
765
  super().__init__()
766
  self.config = config
767
+ dec_config = config.decoder_config
768
+ self.num_channels = dec_config.num_channels
769
+ self.num_layers = dec_config.num_hidden_layers
 
 
770
 
771
  self.embeddings = nn.ModuleList(
772
  [
773
  nn.Embedding(
774
+ dec_config.vocab_size, dec_config.hidden_size, dtype=compute_dtype
775
  )
776
  for _ in range(self.num_channels)
777
  ]
 
784
  )
785
 
786
  self.norm = RMSNorm(
787
+ dec_config.hidden_size,
788
+ eps=dec_config.norm_eps,
789
  dtype=torch.float32,
790
  )
791
 
792
  self.logits_dense = DenseGeneral(
793
+ in_shapes=(dec_config.hidden_size,),
794
+ out_features=(self.num_channels, dec_config.vocab_size),
795
  axis=(-1,),
796
  weight_dtype=compute_dtype,
797
  )
 
799
  def precompute_cross_attn_cache(
800
  self,
801
  enc_out: torch.Tensor, # (B, S, E)
 
802
  ) -> list[KVCache]:
803
  """
804
  Computes the Key and Value tensors for cross-attention for each layer from the encoder output.
 
810
  k_proj = cross_attn_module.k_proj(enc_out)
811
  v_proj = cross_attn_module.v_proj(enc_out)
812
 
 
813
  k = k_proj.transpose(1, 2)
814
  v = v_proj.transpose(1, 2)
815
 
 
821
  self,
822
  tgt_ids_Bx1xC: torch.Tensor, # [B, 1, C]
823
  state: DecoderInferenceState,
824
+ current_idx: int,
825
  ) -> torch.Tensor:
826
  """
827
  Performs a single decoding step, managing KV caches layer by layer.
 
828
  Returns:
829
  A tuple containing:
830
  - logits_Bx1xCV: The final output logits for the current step (B, 1, C*V), cast to float32.
 
844
  state,
845
  self_attn_cache=self_cache,
846
  cross_attn_cache=cross_cache,
847
+ current_idx=current_idx,
848
  )
849
 
850
  x = self.norm(x)
 
857
  ) -> torch.Tensor:
858
  """
859
  Forward pass for the Decoder stack, managing KV caches.
 
860
  Args:
861
  tgt_ids_BxTxC: Target token IDs (B, T, C).
862
  encoder_out: Output from the encoder (B, S, E).
 
870
  precomputed_cross_attn_kv: A single tuple containing the pre-computed K/V cache
871
  derived from `encoder_out`. This is passed identically
872
  to all layers.
 
873
  Returns:
874
  A tuple containing:
875
  - logits: The final output logits (B, T, C * V), cast to float32.
 
904
  return logits_BxTxCxV.to(torch.float32)
905
 
906
 
907
+ class DiaModel(
908
+ nn.Module,
909
+ PyTorchModelHubMixin,
910
+ repo_url="https://github.com/nari-labs/dia",
911
+ pipeline_tag="text-to-speech",
912
+ license="apache-2.0",
913
+ coders={
914
+ DiaConfig: (
915
+ lambda x: x.model_dump(),
916
+ lambda data: DiaConfig.model_validate(data),
917
+ ),
918
+ },
919
+ ):
920
  """PyTorch Dia Model using DenseGeneral."""
921
 
922
  def __init__(self, config: DiaConfig, compute_dtype: torch.dtype):
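Two of the larger additions in dia/layers.py are the MPS-friendly custom_scaled_dot_product_attention (a plain matmul/softmax path that uses repeat_interleave for GQA) and the FusedQKV/patch_fused_qkv machinery on SelfAttention. A quick way to sanity-check the fallback against PyTorch's fused kernel on CPU is sketched below; it assumes a PyTorch build where enable_gqa is available (2.5+) and is not part of the commit itself:

import torch
import torch.nn.functional as F

from dia.layers import custom_scaled_dot_product_attention

B, n_q, n_kv, T, S, H = 2, 16, 4, 1, 7, 128
q = torch.randn(B, n_q, T, H)
k = torch.randn(B, n_kv, S, H)
v = torch.randn(B, n_kv, S, H)

# Reference: fused kernel with grouped-query attention, same unit scale as the diff uses.
ref = F.scaled_dot_product_attention(q, k, v, scale=1.0, enable_gqa=True)
# Fallback path added for MPS in this commit.
out = custom_scaled_dot_product_attention(q, k, v, scale=1.0, num_gqa_groups=n_q // n_kv)

print(torch.allclose(ref, out, atol=1e-5))  # expected to print True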
dia/model.py CHANGED
@@ -1,17 +1,16 @@
1
  import time
2
  from enum import Enum
 
3
 
4
- import dac
5
  import numpy as np
6
  import torch
 
7
  import torchaudio
8
- from huggingface_hub import hf_hub_download
9
 
10
  from .audio import (
11
  apply_audio_delay,
12
  build_delay_indices,
13
  build_revert_indices,
14
- decode,
15
  revert_audio_delay,
16
  )
17
  from .config import DiaConfig
@@ -20,6 +19,7 @@ from .state import DecoderInferenceState, DecoderOutput, EncoderInferenceState
20
 
21
 
22
  DEFAULT_SAMPLE_RATE = 44100
 
23
 
24
 
25
  def _get_default_device():
@@ -34,16 +34,29 @@ def _sample_next_token(
34
  logits_BCxV: torch.Tensor,
35
  temperature: float,
36
  top_p: float,
37
- cfg_filter_top_k: int | None = None,
 
38
  ) -> torch.Tensor:
39
  if temperature == 0.0:
40
  return torch.argmax(logits_BCxV, dim=-1)
41
 
42
  logits_BCxV = logits_BCxV / temperature
43
- if cfg_filter_top_k is not None:
44
- _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=cfg_filter_top_k, dim=-1)
 
 
 
 
 
 
 
 
 
 
 
 
45
  mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
46
- mask.scatter_(dim=-1, index=top_k_indices_BCxV, value=False)
47
  logits_BCxV = logits_BCxV.masked_fill(mask, -torch.inf)
48
 
49
  if top_p < 1.0:
@@ -54,13 +67,15 @@ def _sample_next_token(
54
  cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)
55
 
56
  sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
57
- sorted_indices_to_remove_BCxV[..., 1:] = sorted_indices_to_remove_BCxV[
58
- ..., :-1
59
- ].clone()
60
- sorted_indices_to_remove_BCxV[..., 0] = 0
 
 
61
 
62
  indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
63
- indices_to_remove_BCxV.scatter_(
64
  dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV
65
  )
66
  logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)
@@ -94,12 +109,15 @@ class Dia:
94
  config: DiaConfig,
95
  compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
96
  device: torch.device | None = None,
 
97
  ):
98
  """Initializes the Dia model.
99
 
100
  Args:
101
  config: The configuration object for the model.
 
102
  device: The device to load the model onto. If None, will automatically select the best available device.
 
103
 
104
  Raises:
105
  RuntimeError: If there is an error loading the DAC model.
@@ -110,8 +128,16 @@ class Dia:
110
  if isinstance(compute_dtype, str):
111
  compute_dtype = ComputeDtype(compute_dtype)
112
  self.compute_dtype = compute_dtype.to_dtype()
113
- self.model = DiaModel(config, self.compute_dtype)
114
  self.dac_model = None
 
 
 
 
 
 
 
 
115
 
116
  @classmethod
117
  def from_local(
@@ -120,13 +146,16 @@ class Dia:
120
  checkpoint_path: str,
121
  compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
122
  device: torch.device | None = None,
 
123
  ) -> "Dia":
124
  """Loads the Dia model from local configuration and checkpoint files.
125
 
126
  Args:
127
  config_path: Path to the configuration JSON file.
128
  checkpoint_path: Path to the model checkpoint (.pth) file.
 
129
  device: The device to load the model onto. If None, will automatically select the best available device.
 
130
 
131
  Returns:
132
  An instance of the Dia model loaded with weights and set to eval mode.
@@ -139,7 +168,7 @@ class Dia:
139
  if config is None:
140
  raise FileNotFoundError(f"Config file not found at {config_path}")
141
 
142
- dia = cls(config, compute_dtype, device)
143
 
144
  try:
145
  state_dict = torch.load(checkpoint_path, map_location=dia.device)
@@ -153,15 +182,17 @@ class Dia:
153
 
154
  dia.model.to(dia.device)
155
  dia.model.eval()
156
- dia._load_dac_model()
 
157
  return dia
158
 
159
  @classmethod
160
  def from_pretrained(
161
  cls,
162
- model_name: str = "nari-labs/Dia-1.6B",
163
  compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
164
  device: torch.device | None = None,
 
165
  ) -> "Dia":
166
  """Loads the Dia model from a Hugging Face Hub repository.
167
 
@@ -169,8 +200,10 @@ class Dia:
169
  repository ID and then loads the model.
170
 
171
  Args:
172
- model_name: The Hugging Face Hub repository ID (e.g., "NariLabs/Dia-1.6B").
 
173
  device: The device to load the model onto. If None, will automatically select the best available device.
 
174
 
175
  Returns:
176
  An instance of the Dia model loaded with weights and set to eval mode.
@@ -179,110 +212,192 @@ class Dia:
              FileNotFoundError: If config or checkpoint download/loading fails.
              RuntimeError: If there is an error loading the checkpoint.
          """
-         config_path = hf_hub_download(repo_id=model_name, filename="config.json")
-         checkpoint_path = hf_hub_download(repo_id=model_name, filename="dia-v0_1.pth")
-         return cls.from_local(config_path, checkpoint_path, compute_dtype, device)

      def _load_dac_model(self):
          try:
              dac_model_path = dac.utils.download()
              dac_model = dac.DAC.load(dac_model_path).to(self.device)
          except Exception as e:
              raise RuntimeError("Failed to load DAC model") from e
          self.dac_model = dac_model
193
 
194
- def _prepare_text_input(self, text: str) -> torch.Tensor:
195
- """Encodes text prompt, pads, and creates attention mask and positions."""
196
- text_pad_value = self.config.data.text_pad_value
197
- max_len = self.config.data.text_length
198
 
199
  byte_text = text.encode("utf-8")
 
 
200
  replaced_bytes = byte_text.replace(b"[S1]", b"\x01").replace(b"[S2]", b"\x02")
201
  text_tokens = list(replaced_bytes)
 
 
 
 
 
202
 
203
- current_len = len(text_tokens)
204
- padding_needed = max_len - current_len
205
- if padding_needed <= 0:
206
- text_tokens = text_tokens[:max_len]
207
- padded_text_np = np.array(text_tokens, dtype=np.uint8)
208
- else:
209
- padded_text_np = np.pad(
210
- text_tokens,
211
- (0, padding_needed),
212
- mode="constant",
213
- constant_values=text_pad_value,
214
- ).astype(np.uint8)
215
-
216
- src_tokens = (
217
- torch.from_numpy(padded_text_np).to(torch.long).to(self.device).unsqueeze(0)
218
- ) # [1, S]
219
  return src_tokens
220
 
221
  def _prepare_audio_prompt(
222
- self, audio_prompt: torch.Tensor | None
223
- ) -> tuple[torch.Tensor, int]:
224
- num_channels = self.config.data.channels
225
- audio_bos_value = self.config.data.audio_bos_value
226
- audio_pad_value = self.config.data.audio_pad_value
227
- delay_pattern = self.config.data.delay_pattern
228
- max_delay_pattern = max(delay_pattern)
229
 
230
- prefill = torch.full(
231
- (1, num_channels),
232
- fill_value=audio_bos_value,
233
- dtype=torch.int,
234
- device=self.device,
235
- )
 
236
 
237
- prefill_step = 1
238
 
239
- if audio_prompt is not None:
240
- prefill_step += audio_prompt.shape[0]
241
- prefill = torch.cat([prefill, audio_prompt], dim=0)
 
 
242
 
243
- delay_pad_tensor = torch.full(
244
- (max_delay_pattern, num_channels),
245
  fill_value=-1,
246
  dtype=torch.int,
247
  device=self.device,
248
  )
249
- prefill = torch.cat([prefill, delay_pad_tensor], dim=0)
250
 
251
  delay_precomp = build_delay_indices(
252
- B=1,
253
- T=prefill.shape[0],
254
  C=num_channels,
255
  delay_pattern=delay_pattern,
256
  )
257
 
258
- prefill = apply_audio_delay(
259
- audio_BxTxC=prefill.unsqueeze(0),
260
- pad_value=audio_pad_value,
261
  bos_value=audio_bos_value,
262
  precomp=delay_precomp,
263
- ).squeeze(0)
264
 
265
- return prefill, prefill_step
266
 
267
  def _prepare_generation(
268
- self, text: str, audio_prompt: str | torch.Tensor | None, verbose: bool
 
 
 
 
269
  ):
270
- enc_input_cond = self._prepare_text_input(text)
271
- enc_input_uncond = torch.zeros_like(enc_input_cond)
272
- enc_input = torch.cat([enc_input_uncond, enc_input_cond], dim=0)
273
 
274
- if isinstance(audio_prompt, str):
275
- audio_prompt = self.load_audio(audio_prompt)
276
- prefill, prefill_step = self._prepare_audio_prompt(audio_prompt)
 
277
 
278
- if verbose:
279
- print("generate: data loaded")
 
280
 
281
  enc_state = EncoderInferenceState.new(self.config, enc_input_cond)
282
  encoder_out = self.model.encoder(enc_input, enc_state)
283
 
284
  dec_cross_attn_cache = self.model.decoder.precompute_cross_attn_cache(
285
- encoder_out, enc_state.positions
286
  )
287
  dec_state = DecoderInferenceState.new(
288
  self.config,
@@ -290,15 +405,18 @@ class Dia:
290
  encoder_out,
291
  dec_cross_attn_cache,
292
  self.compute_dtype,
 
293
  )
294
- dec_output = DecoderOutput.new(self.config, self.device)
295
- dec_output.prefill(prefill, prefill_step)
296
 
297
- dec_step = prefill_step - 1
 
 
 
298
  if dec_step > 0:
299
  dec_state.prepare_step(0, dec_step)
300
- tokens_BxTxC = (
301
- dec_output.get_tokens_at(0, dec_step).unsqueeze(0).expand(2, -1, -1)
302
  )
303
  self.model.decoder.forward(tokens_BxTxC, dec_state)
304
 
@@ -311,43 +429,114 @@ class Dia:
          cfg_scale: float,
          temperature: float,
          top_p: float,
-         cfg_filter_top_k: int,
      ) -> torch.Tensor:
-         audio_eos_value = self.config.data.audio_eos_value
-         logits_Bx1xCxV = self.model.decoder.decode_step(tokens_Bx1xC, dec_state)

-         logits_last_BxCxV = logits_Bx1xCxV[:, -1, :, :]
-         uncond_logits_CxV = logits_last_BxCxV[0, :, :]
-         cond_logits_CxV = logits_last_BxCxV[1, :, :]

-         logits_CxV = cond_logits_CxV + cfg_scale * (cond_logits_CxV - uncond_logits_CxV)
-         logits_CxV[:, audio_eos_value + 1 :] = -torch.inf
-         logits_CxV[1:, audio_eos_value:] = -torch.inf

-         pred_C = _sample_next_token(
-             logits_CxV.float(),
              temperature=temperature,
              top_p=top_p,
-             cfg_filter_top_k=cfg_filter_top_k,
          )
-         return pred_C

-     def _generate_output(self, generated_codes: torch.Tensor) -> np.ndarray:
-         num_channels = self.config.data.channels
-         seq_length = generated_codes.shape[0]
-         delay_pattern = self.config.data.delay_pattern
-         audio_pad_value = self.config.data.audio_pad_value
          max_delay_pattern = max(delay_pattern)

          revert_precomp = build_revert_indices(
-             B=1,
              T=seq_length,
              C=num_channels,
              delay_pattern=delay_pattern,
          )

          codebook = revert_audio_delay(
-             audio_BxTxC=generated_codes.unsqueeze(0),
              pad_value=audio_pad_value,
              precomp=revert_precomp,
              T=seq_length,
@@ -358,20 +547,85 @@ class Dia:
          invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
          codebook[invalid_mask] = 0

-         audio = decode(self.dac_model, codebook.transpose(1, 2))

-         return audio.squeeze().cpu().numpy()

      def load_audio(self, audio_path: str) -> torch.Tensor:
          audio, sr = torchaudio.load(audio_path, channels_first=True)  # C, T
          if sr != DEFAULT_SAMPLE_RATE:
              audio = torchaudio.functional.resample(audio, sr, DEFAULT_SAMPLE_RATE)
-         audio = audio.to(self.device).unsqueeze(0)  # 1, C, T
-         audio_data = self.dac_model.preprocess(audio, DEFAULT_SAMPLE_RATE)
-         _, encoded_frame, _, _, _ = self.dac_model.encode(audio_data)  # 1, C, T
-         return encoded_frame.squeeze(0).transpose(0, 1)

      def save_audio(self, path: str, audio: np.ndarray):
          import soundfile as sf

          sf.write(path, audio, DEFAULT_SAMPLE_RATE)
@@ -379,23 +633,63 @@ class Dia:
      @torch.inference_mode()
      def generate(
          self,
-         text: str,
-         max_tokens: int | None = None,
          cfg_scale: float = 3.0,
-         temperature: float = 1.3,
          top_p: float = 0.95,
          use_torch_compile: bool = False,
-         cfg_filter_top_k: int = 35,
-         audio_prompt: str | torch.Tensor | None = None,
-         audio_prompt_path: str | None = None,
          use_cfg_filter: bool | None = None,
          verbose: bool = False,
-     ) -> np.ndarray:
-         audio_eos_value = self.config.data.audio_eos_value
-         audio_pad_value = self.config.data.audio_pad_value
-         delay_pattern = self.config.data.delay_pattern
-         max_tokens = self.config.data.audio_length if max_tokens is None else max_tokens
          max_delay_pattern = max(delay_pattern)
          self.model.eval()

          if audio_prompt_path:
@@ -407,82 +701,179 @@ class Dia:
          if verbose:
              total_start_time = time.time()

-         dec_state, dec_output = self._prepare_generation(text, audio_prompt, verbose)
-         dec_step = dec_output.prefill_step - 1
-
-         bos_countdown = max_delay_pattern
-         eos_detected = False
-         eos_countdown = -1

-         if use_torch_compile:
-             step_fn = torch.compile(self._decoder_step, mode="default")
          else:
-             step_fn = self._decoder_step

          if verbose:
              print("generate: starting generation loop")
              if use_torch_compile:
                  print(
-                     "generate: by using use_torch_compile=True, the first step would take long"
                  )
              start_time = time.time()

          while dec_step < max_tokens:
              dec_state.prepare_step(dec_step)
-             tokens_Bx1xC = (
-                 dec_output.get_tokens_at(dec_step).unsqueeze(0).expand(2, -1, -1)
-             )
-             pred_C = step_fn(
                  tokens_Bx1xC,
                  dec_state,
                  cfg_scale,
                  temperature,
                  top_p,
                  cfg_filter_top_k,
              )

-             if (
-                 not eos_detected and pred_C[0] == audio_eos_value
-             ) or dec_step == max_tokens - max_delay_pattern - 1:
-                 eos_detected = True
-                 eos_countdown = max_delay_pattern
-
-             if eos_countdown > 0:
-                 step_after_eos = max_delay_pattern - eos_countdown
-                 for i, d in enumerate(delay_pattern):
-                     if step_after_eos == d:
-                         pred_C[i] = audio_eos_value
-                     elif step_after_eos > d:
-                         pred_C[i] = audio_pad_value
-                 eos_countdown -= 1
-
-             bos_countdown = max(0, bos_countdown - 1)
-             dec_output.update_one(pred_C, dec_step + 1, bos_countdown > 0)
-
-             if eos_countdown == 0:
-                 break

              dec_step += 1
              if verbose and dec_step % 86 == 0:
                  duration = time.time() - start_time
-                 print(
-                     f"generate step {dec_step}: speed={86 / duration:.3f} tokens/s, realtime factor={1 / duration:.3f}x"
-                 )
                  start_time = time.time()

-         if dec_output.prefill_step >= dec_step + 1:
-             print("Warning: Nothing generated")
-             return None

-         generated_codes = dec_output.generated_tokens[
-             dec_output.prefill_step : dec_step + 1, :
-         ]

-         if verbose:
-             total_step = dec_step + 1 - dec_output.prefill_step
-             total_duration = time.time() - total_start_time
-             print(
-                 f"generate: total step={total_step}, total duration={total_duration:.3f}s"
              )

-         return self._generate_output(generated_codes)
1
  import time
2
  from enum import Enum
3
+ from typing import Callable
4
 
 
5
  import numpy as np
6
  import torch
7
+ import torch.nn.functional as F
8
  import torchaudio
 
9
 
10
  from .audio import (
11
  apply_audio_delay,
12
  build_delay_indices,
13
  build_revert_indices,
 
14
  revert_audio_delay,
15
  )
16
  from .config import DiaConfig
 
19
 
20
 
21
  DEFAULT_SAMPLE_RATE = 44100
22
+ SAMPLE_RATE_RATIO = 512
23
 
24
 
25
  def _get_default_device():
 
34
  logits_BCxV: torch.Tensor,
35
  temperature: float,
36
  top_p: float,
37
+ top_k: int | None,
38
+ audio_eos_value: int,
39
  ) -> torch.Tensor:
40
  if temperature == 0.0:
41
  return torch.argmax(logits_BCxV, dim=-1)
42
 
43
  logits_BCxV = logits_BCxV / temperature
44
+
45
+ if audio_eos_value is not None and audio_eos_value >= 0:
46
+ top_logit_indices_BC = torch.argmax(logits_BCxV, dim=-1)
47
+ eos_not_highest_mask_BC = top_logit_indices_BC != audio_eos_value
48
+ mask_eos_unless_highest_BCxV = torch.zeros_like(logits_BCxV, dtype=torch.bool)
49
+ mask_eos_unless_highest_BCxV[eos_not_highest_mask_BC, audio_eos_value] = True
50
+ logits_BCxV = logits_BCxV.masked_fill(mask_eos_unless_highest_BCxV, -torch.inf)
51
+ eos_highest_mask_BC = top_logit_indices_BC == audio_eos_value
52
+ mask_eos_highest_BCxV = torch.zeros_like(logits_BCxV, dtype=torch.bool)
53
+ mask_eos_highest_BCxV[eos_highest_mask_BC, :audio_eos_value] = True
54
+ logits_BCxV = logits_BCxV.masked_fill(mask_eos_highest_BCxV, -torch.inf)
55
+
56
+ if top_k is not None:
57
+ _, top_k_indices_BCxV = torch.topk(logits_BCxV, k=top_k, dim=-1)
58
  mask = torch.ones_like(logits_BCxV, dtype=torch.bool)
59
+ mask = mask.scatter(dim=-1, index=top_k_indices_BCxV, value=False)
60
  logits_BCxV = logits_BCxV.masked_fill(mask, -torch.inf)
61
 
62
  if top_p < 1.0:
 
67
  cumulative_probs_BCxV = torch.cumsum(sorted_probs_BCxV, dim=-1)
68
 
69
  sorted_indices_to_remove_BCxV = cumulative_probs_BCxV > top_p
70
+ sorted_indices_to_remove_BCxV = torch.roll(
71
+ sorted_indices_to_remove_BCxV, shifts=1, dims=-1
72
+ )
73
+ sorted_indices_to_remove_BCxV[..., 0] = torch.zeros_like(
74
+ sorted_indices_to_remove_BCxV[..., 0]
75
+ )
76
 
77
  indices_to_remove_BCxV = torch.zeros_like(sorted_indices_to_remove_BCxV)
78
+ indices_to_remove_BCxV = indices_to_remove_BCxV.scatter(
79
  dim=-1, index=sorted_indices_BCxV, src=sorted_indices_to_remove_BCxV
80
  )
81
  logits_BCxV = logits_BCxV.masked_fill(indices_to_remove_BCxV, -torch.inf)
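
The reworked top-p filter above replaces the previous in-place slice-and-`clone()` shift with an out-of-place `torch.roll` followed by a non-in-place `scatter`, presumably to behave better under `torch.compile`/CUDA graphs. A minimal editor's sketch on a toy tensor (not taken from the commit) showing the two shifts agree:

import torch

sorted_remove = torch.tensor([[False, False, True, True]])

# Old shift: in-place slice assignment with a clone.
old = sorted_remove.clone()
old[..., 1:] = old[..., :-1].clone()
old[..., 0] = False

# New shift: out-of-place roll, then clear the first position so the top token survives.
new = torch.roll(sorted_remove, shifts=1, dims=-1)
new[..., 0] = torch.zeros_like(new[..., 0])

assert torch.equal(old, new)
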
 
109
  config: DiaConfig,
110
  compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
111
  device: torch.device | None = None,
112
+ load_dac: bool = True,
113
  ):
114
  """Initializes the Dia model.
115
 
116
  Args:
117
  config: The configuration object for the model.
118
+ compute_dtype: The computation dtype to use.
119
  device: The device to load the model onto. If None, will automatically select the best available device.
120
+ load_dac: Whether to load the DAC model.
121
 
122
  Raises:
123
  RuntimeError: If there is an error loading the DAC model.
 
128
  if isinstance(compute_dtype, str):
129
  compute_dtype = ComputeDtype(compute_dtype)
130
  self.compute_dtype = compute_dtype.to_dtype()
131
+ self.model: DiaModel = DiaModel(config, self.compute_dtype)
132
  self.dac_model = None
133
+ self._compiled_step = None
134
+ self.load_dac = load_dac
135
+
136
+ if not self.load_dac:
137
+ print("Warning: DAC model will not be loaded. This is not recommended.")
138
+
139
+ if torch.cuda.is_available():
140
+ torch.backends.cuda.matmul.allow_tf32 = True
141
 
142
  @classmethod
143
  def from_local(
 
146
  checkpoint_path: str,
147
  compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
148
  device: torch.device | None = None,
149
+ load_dac: bool = True,
150
  ) -> "Dia":
151
  """Loads the Dia model from local configuration and checkpoint files.
152
 
153
  Args:
154
  config_path: Path to the configuration JSON file.
155
  checkpoint_path: Path to the model checkpoint (.pth) file.
156
+ compute_dtype: The computation dtype to use.
157
  device: The device to load the model onto. If None, will automatically select the best available device.
158
+ load_dac: Whether to load the DAC model.
159
 
160
  Returns:
161
  An instance of the Dia model loaded with weights and set to eval mode.
 
168
  if config is None:
169
  raise FileNotFoundError(f"Config file not found at {config_path}")
170
 
171
+ dia = cls(config, compute_dtype, device, load_dac)
172
 
173
  try:
174
  state_dict = torch.load(checkpoint_path, map_location=dia.device)
 
182
 
183
  dia.model.to(dia.device)
184
  dia.model.eval()
185
+ if load_dac:
186
+ dia._load_dac_model()
187
  return dia
188
 
189
  @classmethod
190
  def from_pretrained(
191
  cls,
192
+ model_name: str = "nari-labs/Dia-1.6B-0626",
193
  compute_dtype: str | ComputeDtype = ComputeDtype.FLOAT32,
194
  device: torch.device | None = None,
195
+ load_dac: bool = True,
196
  ) -> "Dia":
197
  """Loads the Dia model from a Hugging Face Hub repository.
198
 
 
200
  repository ID and then loads the model.
201
 
202
  Args:
203
+ model_name: The Hugging Face Hub repository ID (e.g., "nari-labs/Dia-1.6B-0626").
204
+ compute_dtype: The computation dtype to use.
205
  device: The device to load the model onto. If None, will automatically select the best available device.
206
+ load_dac: Whether to load the DAC model.
207
 
208
  Returns:
209
  An instance of the Dia model loaded with weights and set to eval mode.
 
212
  FileNotFoundError: If config or checkpoint download/loading fails.
213
  RuntimeError: If there is an error loading the checkpoint.
214
  """
215
+ if isinstance(compute_dtype, str):
216
+ compute_dtype = ComputeDtype(compute_dtype)
217
+
218
+ # Load model directly using DiaModel's from_pretrained which handles HF download
219
+ try:
220
+ loaded_model = DiaModel.from_pretrained(
221
+ model_name, compute_dtype=compute_dtype.to_dtype()
222
+ )
223
+ except Exception as e:
224
+ raise RuntimeError(
225
+ f"Error loading model from Hugging Face Hub ({model_name})"
226
+ ) from e
227
+
228
+ config = loaded_model.config # Get config from the loaded model
229
+ dia = cls(config, compute_dtype, device, load_dac)
230
+
231
+ dia.model = loaded_model # Assign the already loaded model
232
+ dia.model.to(dia.device)
233
+ dia.model.eval()
234
+ if load_dac:
235
+ dia._load_dac_model()
236
+ return dia
237
 
238
  def _load_dac_model(self):
239
+ """Loads the Descript Audio Codec (DAC) model.
240
+
241
+ Downloads the DAC model if necessary and loads it onto the specified device.
242
+ Sets the DAC model to evaluation mode.
243
+
244
+ Raises:
245
+ RuntimeError: If downloading or loading the DAC model fails.
246
+ """
247
+ import dac
248
+
249
  try:
250
  dac_model_path = dac.utils.download()
251
  dac_model = dac.DAC.load(dac_model_path).to(self.device)
252
+ dac_model.eval() # Ensure DAC is in eval mode
253
  except Exception as e:
254
  raise RuntimeError("Failed to load DAC model") from e
255
  self.dac_model = dac_model
256
 
257
+ def _encode_text(self, text: str) -> torch.Tensor:
258
+ """Encodes the input text string into a tensor of token IDs using byte-level encoding.
259
+
260
+ Special tokens [S1] and [S2] are replaced by their byte values. The resulting
261
+ sequence is truncated to the maximum configured text length.
262
+
263
+ Args:
264
+ text: The input text string.
265
+
266
+ Returns:
267
+ A tensor containing the encoded byte token IDs.
268
+ """
269
+ max_len = self.config.encoder_config.max_position_embeddings
270
 
271
  byte_text = text.encode("utf-8")
272
+ # Replace special tokens with their byte values if needed by the specific tokenizer/config
273
+ # Assuming byte values 1 and 2 are correct placeholders based on original code
274
  replaced_bytes = byte_text.replace(b"[S1]", b"\x01").replace(b"[S2]", b"\x02")
275
  text_tokens = list(replaced_bytes)
276
+ return torch.tensor(
277
+ text_tokens[:max_len],
278
+ dtype=torch.long,
279
+ device=self.device,
280
+ )
281
 
282
+ def _pad_text_input(self, text_tokens: list[torch.Tensor]) -> torch.Tensor:
283
+ """Pads the text input to the maximum length."""
284
+ text_pad_value = 0
285
+ max_len = self.config.encoder_config.max_position_embeddings
286
+ batch_size = len(text_tokens)
287
+
288
+ src_tokens = torch.full(
289
+ (batch_size, 1, max_len),
290
+ fill_value=text_pad_value,
291
+ dtype=torch.long,
292
+ device=self.device,
293
+ )
294
+ for i in range(batch_size):
295
+ current_len = len(text_tokens[i])
296
+ src_tokens[i, 0, :current_len] = text_tokens[i]
 
297
  return src_tokens
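
A toy sketch of the byte-level encoding performed by `_encode_text` above; the example string is hypothetical:

text = "[S1] Hello. [S2] Hi there."
byte_text = text.encode("utf-8")
replaced = byte_text.replace(b"[S1]", b"\x01").replace(b"[S2]", b"\x02")
tokens = list(replaced)  # speaker tags become byte values 1 and 2
print(tokens[:8])  # [1, 32, 72, 101, 108, 108, 111, 46]
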
298
 
299
  def _prepare_audio_prompt(
300
+ self, audio_prompts: list[torch.Tensor | None]
301
+ ) -> tuple[torch.Tensor, list[int]]:
302
+ """Prepares the audio prompt tensor for the decoder.
 
 
 
 
303
 
304
+ Handles padding, adds the beginning-of-sequence (BOS) token, applies the
305
+ delay pattern, and determines the number of prefill steps for each item
306
+ in the batch.
307
+
308
+ Args:
309
+ audio_prompts: A list of audio prompt tensors (encoded DAC frames) or None.
310
+ Each tensor should have shape [T, C].
311
 
312
+ Returns:
313
+ A tuple containing:
314
+ - delayed_batch (torch.Tensor): The prepared audio prompt tensor with
315
+ delays applied, shape [B, T_max_padded, C].
316
+ - prefill_steps (list[int]): A list containing the number of valid
317
+ tokens (including BOS) for each prompt in the batch.
318
+ """
319
+ num_channels = self.config.decoder_config.num_channels
320
+ audio_bos_value = self.config.bos_token_id
321
+ delay_pattern = self.config.delay_pattern
322
+ max_delay_pattern = max(delay_pattern)
323
+ batch_size = len(audio_prompts)
324
 
325
+ max_len = (
326
+ max(p.shape[0] if p is not None else 0 for p in audio_prompts)
327
+ + max_delay_pattern
328
+ )
329
+ prefill_steps = []
330
 
331
+ prefill = torch.full(
332
+ (batch_size, max_len, num_channels),
333
  fill_value=-1,
334
  dtype=torch.int,
335
  device=self.device,
336
  )
337
+
338
+ prefill[:, 0, :] = audio_bos_value
339
+
340
+ for i in range(batch_size):
341
+ prompt = audio_prompts[i]
342
+ if prompt is not None:
343
+ prompt = prompt.to(device=self.device, dtype=torch.int)
344
+ prefill[i, 1 : prompt.shape[0] + 1, :] = prompt
345
+ prefill_steps.append(prompt.shape[0] + 1)
346
+ else:
347
+ prefill_steps.append(1)
348
 
349
  delay_precomp = build_delay_indices(
350
+ B=batch_size,
351
+ T=max_len,
352
  C=num_channels,
353
  delay_pattern=delay_pattern,
354
  )
355
 
356
+ delayed_batch = apply_audio_delay(
357
+ audio_BxTxC=prefill,
358
+ pad_value=-1,
359
  bos_value=audio_bos_value,
360
  precomp=delay_precomp,
361
+ )
362
 
363
+ return delayed_batch, prefill_steps
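
Conceptually, the delay pattern applied above shifts each codebook channel right by its own offset, filling the gap with the BOS marker; the real code does this with precomputed indices via `build_delay_indices`/`apply_audio_delay`. A toy, editor-written illustration with made-up values:

import torch

delay_pattern = [0, 1, 2]                 # toy 3-channel pattern
codes_TxC = torch.arange(12).view(4, 3)   # 4 timesteps, 3 channels
bos = -1

delayed_TxC = torch.full_like(codes_TxC, bos)
for c, d in enumerate(delay_pattern):
    # channel c starts d steps later; earlier positions keep the BOS marker
    delayed_TxC[d:, c] = codes_TxC[: codes_TxC.shape[0] - d, c]
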
364
 
365
  def _prepare_generation(
366
+ self,
367
+ text: torch.Tensor,
368
+ audio_prompts: list[torch.Tensor | None],
369
+ max_tokens: int | None = None,
370
+ attn_fn: Callable = F.scaled_dot_product_attention,
371
  ):
372
+ """Initializes the model state for generation.
 
 
373
 
374
+ Encodes the text input (conditional and unconditional), prepares the
375
+ encoder and decoder states (including KV caches and cross-attention),
376
+ prepares the audio prompt, and performs the initial decoder prefill steps
377
+ based on the audio prompts.
378
 
379
+ Args:
380
+ text: The padded text input tensor, shape [B, 1, T_text].
381
+ audio_prompts: A list of prepared audio prompt tensors or None.
382
+
383
+ Returns:
384
+ A tuple containing:
385
+ - dec_state (DecoderInferenceState): The initialized decoder state.
386
+ - dec_output (DecoderOutput): The initialized decoder output manager,
387
+ containing the prefilled audio tokens.
388
+ """
389
+ batch_size = text.shape[0]
390
+
391
+ enc_input_uncond = torch.zeros_like(text)
392
+ enc_input_cond = text
393
+ stacked_inputs = torch.stack([enc_input_uncond, enc_input_cond], dim=1)
394
+ enc_input = stacked_inputs.view(2 * batch_size, -1)
395
 
396
  enc_state = EncoderInferenceState.new(self.config, enc_input_cond)
397
  encoder_out = self.model.encoder(enc_input, enc_state)
398
 
399
  dec_cross_attn_cache = self.model.decoder.precompute_cross_attn_cache(
400
+ encoder_out
401
  )
402
  dec_state = DecoderInferenceState.new(
403
  self.config,
 
405
  encoder_out,
406
  dec_cross_attn_cache,
407
  self.compute_dtype,
408
+ max_generation_length=max_tokens,
409
  )
410
+ prefill, prefill_steps = self._prepare_audio_prompt(audio_prompts)
 
411
 
412
+ dec_output = DecoderOutput.new(batch_size, self.config, self.device)
413
+ dec_output.prefill(prefill, prefill_steps)
414
+
415
+ dec_step = min(prefill_steps) - 1
416
  if dec_step > 0:
417
  dec_state.prepare_step(0, dec_step)
418
+ tokens_BxTxC = dec_output.get_tokens_at(0, dec_step).repeat_interleave(
419
+ 2, dim=0
420
  )
421
  self.model.decoder.forward(tokens_BxTxC, dec_state)
422
 
 
429
  cfg_scale: float,
430
  temperature: float,
431
  top_p: float,
432
+ top_k: int,
433
+ current_idx: int,
434
  ) -> torch.Tensor:
435
+ """Performs a single step of the decoder inference.
436
+
437
+ Takes the tokens from the previous step, runs them through the decoder
438
+ (for both conditional and unconditional paths), applies classifier-free
439
+ guidance (CFG), samples the next token using temperature, top-p, and top-k
440
+ sampling, and applies constraints (e.g., preventing EOS in certain channels).
441
+
442
+ Args:
443
+ tokens_Bx1xC: The input tokens for the current step, shape [2*B, 1, C].
444
+ Repeated for CFG (unconditional and conditional).
445
+ dec_state: The current state of the decoder (KV caches, etc.).
446
+ cfg_scale: The scale factor for classifier-free guidance.
447
+ temperature: The temperature for sampling.
448
+ top_p: The cumulative probability threshold for top-p sampling.
449
+ top_k: The number of top logits to consider for top-k sampling.
450
+ current_idx: The current generation step index.
451
+
452
+ Returns:
453
+ torch.Tensor: The sampled next tokens for each item in the batch,
454
+ shape [B, C].
455
+ """
456
+ B = tokens_Bx1xC.shape[0] // 2
457
+
458
+ audio_eos_value = self.config.eos_token_id
459
+ logits_Bx1xCxV = self.model.decoder.decode_step(
460
+ tokens_Bx1xC, dec_state, current_idx
461
+ )
462
+
463
+ logits_last_2BxCxV = logits_Bx1xCxV[:, -1]
464
+ logits_last_Bx2xCxV = logits_last_2BxCxV.view(
465
+ B, 2, *logits_last_2BxCxV.shape[1:]
466
+ )
467
 
468
+ uncond_logits_BxCxV = logits_last_Bx2xCxV[:, 0, :, :] # Shape [B, C, V]
469
+ cond_logits_BxCxV = logits_last_Bx2xCxV[:, 1, :, :] # Shape [B, C, V]
470
+ logits_BxCxV = cond_logits_BxCxV + cfg_scale * (
471
+ cond_logits_BxCxV - uncond_logits_BxCxV
472
+ )
473
+
474
+ _, top_k_indices_BxCxk = torch.topk(logits_BxCxV, k=top_k, dim=-1)
475
+ mask_BxCxV = torch.ones_like(logits_BxCxV, dtype=torch.bool)
476
+ mask_BxCxV = mask_BxCxV.scatter(dim=-1, index=top_k_indices_BxCxk, value=False)
477
+ logits_BxCxV = cond_logits_BxCxV.masked_fill(mask_BxCxV, -torch.inf)
478
+
479
+ logits_BxCxV[:, :, audio_eos_value + 1 :] = torch.full_like(
480
+ logits_BxCxV[:, :, audio_eos_value + 1 :],
481
+ fill_value=-torch.inf,
482
+ )
483
+ logits_BxCxV[:, 1:, audio_eos_value:] = torch.full_like(
484
+ logits_BxCxV[:, 1:, audio_eos_value:],
485
+ fill_value=-torch.inf,
486
+ )
487
 
488
+ flat_logits_BCxV = logits_BxCxV.view(
489
+ B * self.config.decoder_config.num_channels, -1
490
+ )
491
 
492
+ pred_BC = _sample_next_token(
493
+ flat_logits_BCxV.float(),
494
  temperature=temperature,
495
  top_p=top_p,
496
+ top_k=top_k,
497
+ audio_eos_value=audio_eos_value,
498
  )
 
499
 
500
+ pred_BxC = pred_BC.view(B, self.config.decoder_config.num_channels)
501
+ return pred_BxC
502
+
503
+ def _generate_output(
504
+ self, generated_codes: torch.Tensor, lengths_Bx: torch.Tensor
505
+ ) -> list[np.ndarray]:
506
+ """Converts generated delayed codes into audio waveforms.
507
+
508
+ Reverts the delay pattern applied during generation, decodes the resulting
509
+ codebook using the DAC model (if loaded), and returns a list of audio
510
+ waveforms as NumPy arrays. If DAC is not loaded, returns the raw codebook indices.
511
+
512
+ Args:
513
+ generated_codes: The tensor of generated audio codes with delays,
514
+ shape [B, T_gen, C].
515
+ lengths_Bx: A tensor containing the valid length of generated codes
516
+ (excluding padding and BOS/EOS markers) for each item
517
+ in the batch, shape [B].
518
+
519
+ Returns:
520
+ A list of NumPy arrays, where each array represents the generated audio
521
+ waveform for one item in the batch. If DAC is not loaded, returns the
522
+ raw, reverted codebook indices as NumPy arrays.
523
+ """
524
+ num_channels = self.config.decoder_config.num_channels
525
+ batch_size = generated_codes.shape[0]
526
+ seq_length = generated_codes.shape[1]
527
+ delay_pattern = self.config.delay_pattern
528
+ audio_pad_value = self.config.pad_token_id
529
  max_delay_pattern = max(delay_pattern)
530
 
531
  revert_precomp = build_revert_indices(
532
+ B=batch_size,
533
  T=seq_length,
534
  C=num_channels,
535
  delay_pattern=delay_pattern,
536
  )
537
 
538
  codebook = revert_audio_delay(
539
+ audio_BxTxC=generated_codes,
540
  pad_value=audio_pad_value,
541
  precomp=revert_precomp,
542
  T=seq_length,
 
547
  invalid_mask = (codebook < min_valid_index) | (codebook > max_valid_index)
548
  codebook[invalid_mask] = 0
549
 
550
+ audios = []
551
 
552
+ if self.load_dac:
553
+ for i in range(batch_size):
554
+ audio = self._decode(codebook[i, : lengths_Bx[i], :])
555
+ audio_np = audio.cpu().numpy()
556
+ audios.append(audio_np)
557
+ else:
558
+ for i in range(batch_size):
559
+ audios.append(codebook[i, : lengths_Bx[i], :].cpu().numpy())
560
+ return audios
561
+
562
+ @torch.no_grad()
563
+ @torch.inference_mode()
564
+ def _encode(self, audio: torch.Tensor) -> torch.Tensor:
565
+ """
566
+ Encodes the given audio waveform into a tensor of DAC codebook indices
567
+ """
568
+ audio = audio.unsqueeze(0)
569
+ audio_data = self.dac_model.preprocess(audio, DEFAULT_SAMPLE_RATE)
570
+ _, encoded_frame, _, _, _ = self.dac_model.encode(audio_data)
571
+ encoded_frame: torch.Tensor
572
+ return encoded_frame.squeeze(0).transpose(0, 1)
573
+
574
+ @torch.no_grad()
575
+ @torch.inference_mode()
576
+ def _decode(self, audio_codes: torch.Tensor) -> torch.Tensor:
577
+ """
578
+ Decodes the given frames into an output audio waveform
579
+ """
580
+ audio_codes = audio_codes.unsqueeze(0).transpose(1, 2)
581
+ audio_values, _, _ = self.dac_model.quantizer.from_codes(audio_codes)
582
+ audio_values = self.dac_model.decode(audio_values)
583
+ audio_values: torch.Tensor
584
+ return audio_values.squeeze()
585
 
586
  def load_audio(self, audio_path: str) -> torch.Tensor:
587
+ """Loads and preprocesses an audio file for use as a prompt.
588
+
589
+ Loads the audio file, resamples it to the target sample rate if necessary,
590
+ preprocesses it using the DAC model's preprocessing, and encodes it into
591
+ DAC codebook indices.
592
+
593
+ Args:
594
+ audio_path: Path to the audio file.
595
+
596
+ Returns:
597
+ torch.Tensor: The encoded audio prompt as DAC codebook indices,
598
+ shape [T, C].
599
+
600
+ Raises:
601
+ RuntimeError: If the DAC model is not loaded (`load_dac=False` during init).
602
+ FileNotFoundError: If the audio file cannot be found.
603
+ Exception: If there's an error during loading or processing.
604
+ """
605
+ if self.dac_model is None:
606
+ raise RuntimeError(
607
+ "DAC model is required for loading audio prompts but was not loaded."
608
+ )
609
  audio, sr = torchaudio.load(audio_path, channels_first=True) # C, T
610
  if sr != DEFAULT_SAMPLE_RATE:
611
  audio = torchaudio.functional.resample(audio, sr, DEFAULT_SAMPLE_RATE)
612
+ # Convert to mono if stereo
613
+ if audio.shape[0] > 1:
614
+ audio = torch.mean(
615
+ audio, dim=0, keepdim=True
616
+ ) # Average channels to get mono
617
+ return self._encode(audio.to(self.device))
618
 
619
  def save_audio(self, path: str, audio: np.ndarray):
620
+ """Saves the generated audio waveform to a file.
621
+
622
+ Uses the soundfile library to write the NumPy audio array to the specified
623
+ path with the default sample rate.
624
+
625
+ Args:
626
+ path: The path where the audio file will be saved.
627
+ audio: The audio waveform as a NumPy array.
628
+ """
629
  import soundfile as sf
630
 
631
  sf.write(path, audio, DEFAULT_SAMPLE_RATE)
 
633
  @torch.inference_mode()
634
  def generate(
635
  self,
636
+ text: str | list[str],
637
+ max_tokens: int = 3072,
638
  cfg_scale: float = 3.0,
639
+ temperature: float = 1.2,
640
  top_p: float = 0.95,
641
  use_torch_compile: bool = False,
642
+ cfg_filter_top_k: int = 45,
643
+ audio_prompt: list[str | torch.Tensor | None]
644
+ | str
645
+ | torch.Tensor
646
+ | None = None,
647
+ audio_prompt_path: list[str | torch.Tensor | None]
648
+ | str
649
+ | torch.Tensor
650
+ | None = None,
651
  use_cfg_filter: bool | None = None,
652
  verbose: bool = False,
653
+ ) -> np.ndarray | list[np.ndarray]:
654
+ """Generates audio corresponding to the input text.
655
+
656
+ Args:
657
+ text: The input text prompt, or a list of text prompts for batch generation.
658
+ max_tokens: The maximum number of audio tokens to generate per prompt.
659
+ Defaults to the model's configured audio length if None.
660
+ cfg_scale: The scale factor for classifier-free guidance (CFG). Higher values
661
+ lead to stronger guidance towards the text prompt.
662
+ temperature: The temperature for sampling. Higher values increase randomness.
663
+ top_p: The cumulative probability threshold for nucleus (top-p) sampling.
664
+ use_torch_compile: Whether to compile the generation steps using torch.compile.
665
+ Can significantly speed up generation after the initial
666
+ compilation overhead. Defaults to False.
667
+ cfg_filter_top_k: The number of top logits to consider during CFG filtering.
668
+ (Note: This parameter name might be slightly misleading based
669
+ on the code; it's used in the `_sample_next_token` function.)
670
+ audio_prompt: An audio prompt or list of prompts to condition the generation.
671
+ Can be a file path (str), a pre-loaded tensor (DAC codes), or None.
672
+ If a list, its length must match the batch size of the text input.
673
+ audio_prompt_path: (Deprecated) Use `audio_prompt` instead.
674
+ use_cfg_filter: (Deprecated) This parameter is no longer used.
675
+ verbose: If True, prints progress information during generation, including
676
+ speed metrics.
677
+
678
+ Returns:
679
+ If a single text prompt was provided, returns a NumPy array containing the
680
+ generated audio waveform.
681
+ If a list of text prompts was provided, returns a list of NumPy arrays,
682
+ each corresponding to a prompt in the input list. Returns None for a
683
+ sequence if no audio was generated for it.
684
+ """
685
+ batch_size = len(text) if isinstance(text, list) else 1
686
+ audio_eos_value = self.config.eos_token_id
687
+ audio_pad_value = self.config.pad_token_id
688
+ delay_pattern = self.config.delay_pattern
689
  max_delay_pattern = max(delay_pattern)
690
+ delay_pattern_Cx = torch.tensor(
691
+ delay_pattern, device=self.device, dtype=torch.long
692
+ )
693
  self.model.eval()
694
 
695
  if audio_prompt_path:
 
701
  if verbose:
702
  total_start_time = time.time()
703
 
704
+ if use_torch_compile and not hasattr(self, "_compiled"):
705
+ # Compilation can take about a minute.
706
+ self._prepare_generation = torch.compile(
707
+ self._prepare_generation, dynamic=True, fullgraph=True
708
+ )
709
+ self._decoder_step = torch.compile(
710
+ self._decoder_step, fullgraph=True, mode="max-autotune"
711
+ )
712
+ self._compiled = True
713
+
714
+ if isinstance(audio_prompt, list):
715
+ audio_prompt = [
716
+ self.load_audio(p) if isinstance(p, str) else p for p in audio_prompt
717
+ ]
718
+ elif isinstance(audio_prompt, str):
719
+ audio_prompt = [self.load_audio(audio_prompt)]
720
+ elif isinstance(audio_prompt, torch.Tensor):
721
+ audio_prompt = [audio_prompt]
722
+ elif audio_prompt is None:
723
+ audio_prompt = [None] * batch_size
724
+
725
+ assert len(audio_prompt) == batch_size, (
726
+ "Number of audio prompts must match batch size"
727
+ )
728
 
729
+ if isinstance(text, list):
730
+ text = [self._encode_text(t) for t in text]
731
  else:
732
+ text = [self._encode_text(text)]
733
+ text = self._pad_text_input(text)
734
+
735
+ dec_state, dec_output = self._prepare_generation(
736
+ text, audio_prompt, max_tokens=max_tokens
737
+ )
738
+ dec_step = min(dec_output.prefill_steps) - 1
739
+ current_idx = torch.tensor([dec_step], device=self.device)
740
+
741
+ eos_detected_Bx = torch.zeros(
742
+ (batch_size,), dtype=torch.bool, device=self.device
743
+ )
744
+ eos_countdown_Bx = torch.full(
745
+ (batch_size,), -1, dtype=torch.long, device=self.device
746
+ )
747
+ finished_step_Bx = torch.full(
748
+ (batch_size,), -1, dtype=torch.long, device=self.device
749
+ )
750
+
751
+ bos_over = False
752
 
753
  if verbose:
754
  print("generate: starting generation loop")
755
  if use_torch_compile:
756
  print(
757
+ "generate: using use_torch_compile=True, the first step may be slow"
758
  )
759
  start_time = time.time()
760
 
761
+ # --- Generation Loop ---
762
  while dec_step < max_tokens:
763
+ if (eos_countdown_Bx == 0).all():
764
+ break
765
+
766
+ current_step_idx = dec_step + 1
767
+ torch.compiler.cudagraph_mark_step_begin()
768
  dec_state.prepare_step(dec_step)
769
+ tokens_Bx1xC = dec_output.get_tokens_at(dec_step).repeat_interleave(
770
+ 2, dim=0
771
+ ) # Repeat for CFG
772
+
773
+ pred_BxC = self._decoder_step(
774
  tokens_Bx1xC,
775
  dec_state,
776
  cfg_scale,
777
  temperature,
778
  top_p,
779
  cfg_filter_top_k,
780
+ current_idx,
781
  )
782
 
783
+ current_idx += 1
784
+
785
+ active_mask_Bx = eos_countdown_Bx != 0
786
+ eos_trigger_Bx = torch.zeros_like(active_mask_Bx)
787
+ if active_mask_Bx.any():
788
+ is_eos_token = (~eos_detected_Bx[active_mask_Bx]) & (
789
+ pred_BxC[active_mask_Bx, 0] == audio_eos_value
790
+ )
791
+ is_max_len = current_step_idx >= max_tokens - max_delay_pattern
792
+ eos_trigger_Bx[active_mask_Bx] = is_eos_token | is_max_len
793
+ eos_detected_Bx |= eos_trigger_Bx
794
+ start_countdown_mask_Bx = eos_trigger_Bx & (eos_countdown_Bx < 0)
795
+ if start_countdown_mask_Bx.any():
796
+ eos_countdown_Bx[start_countdown_mask_Bx] = max_delay_pattern
797
+ finished_step_Bx[start_countdown_mask_Bx] = current_step_idx
798
+
799
+ padding_mask_Bx = eos_countdown_Bx > 0
800
+ if padding_mask_Bx.any():
801
+ pred_active_BxC = pred_BxC[padding_mask_Bx].clone()
802
+ countdown_active_Bx = eos_countdown_Bx[padding_mask_Bx]
803
+ step_after_eos_Bx = max_delay_pattern - countdown_active_Bx
804
+ step_after_eos_Bx_ = step_after_eos_Bx.unsqueeze(1)
805
+ delay_pattern_Cx_ = delay_pattern_Cx.unsqueeze(0)
806
+ eos_mask_NxC = step_after_eos_Bx_ == delay_pattern_Cx_
807
+ pad_mask_NxC = step_after_eos_Bx_ > delay_pattern_Cx_
808
+ pred_active_BxC[eos_mask_NxC] = audio_eos_value
809
+ pred_active_BxC[pad_mask_NxC] = audio_pad_value
810
+ pred_BxC[padding_mask_Bx] = pred_active_BxC
811
+ eos_countdown_Bx[padding_mask_Bx] -= 1
812
+
813
+ # --- Update BOS flag (Original) ---
814
+ if not bos_over:
815
+ bos_over = all(
816
+ dec_step - prefill_step > max_delay_pattern
817
+ for prefill_step in dec_output.prefill_steps
818
+ )
819
+
820
+ dec_output.update_one(pred_BxC, current_step_idx, not bos_over)
821
 
822
  dec_step += 1
823
+
824
  if verbose and dec_step % 86 == 0:
825
  duration = time.time() - start_time
826
+ if duration > 0:
827
+ print(
828
+ f"generate step {dec_step}: speed={86 * batch_size / duration:.3f} tokens/s, realtime factor={batch_size / duration:.3f}x"
829
+ )
830
  start_time = time.time()
831
 
832
+ # --- Finalize and Extract Output ---
833
+ final_step = dec_step + 1
 
834
 
835
+ finished_step_Bx[finished_step_Bx == -1] = final_step - max_delay_pattern
 
 
836
 
837
+ prefill_steps_tensor = torch.tensor(
838
+ dec_output.prefill_steps, device=self.device
839
+ )
840
+ lengths_Bx = finished_step_Bx - prefill_steps_tensor
841
+ lengths_Bx = torch.clamp(lengths_Bx, min=0)
842
+
843
+ max_len = lengths_Bx.max().item() + max_delay_pattern
844
+ outputs = []
845
+
846
+ if max_len > 0:
847
+ num_channels = self.config.decoder_config.num_channels
848
+ audio_pad_value = self.config.pad_token_id
849
+ generated_codes = torch.full(
850
+ (batch_size, max_len, num_channels),
851
+ fill_value=audio_pad_value,
852
+ dtype=torch.long,
853
+ device=self.device,
854
  )
855
 
856
+ for i in range(batch_size):
857
+ start_step = dec_output.prefill_steps[i]
858
+ actual_len = lengths_Bx[i].item() + max_delay_pattern
859
+ if actual_len > 0:
860
+ tokens_to_copy = dec_output.generated_tokens[
861
+ i, start_step : start_step + actual_len, :
862
+ ]
863
+ generated_codes[i, :actual_len, :] = tokens_to_copy
864
+
865
+ if verbose:
866
+ avg_steps = lengths_Bx.float().mean().item()
867
+ total_duration = time.time() - total_start_time
868
+ print(
869
+ f"generate: avg steps={avg_steps:.1f}, total duration={total_duration:.3f}s"
870
+ )
871
+
872
+ del dec_state
873
+
874
+ outputs = self._generate_output(generated_codes, lengths_Bx)
875
+ else:
876
+ print("Warning: Nothing generated for any sequence in the batch.")
877
+ outputs = [None] * batch_size
878
+
879
+ return outputs if batch_size > 1 else outputs[0]
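
A short usage sketch of the batched `generate` API defined above; the prompts and output filenames are hypothetical:

from dia.model import Dia

model = Dia.from_pretrained("nari-labs/Dia-1.6B-0626", compute_dtype="float16")

texts = [
    "[S1] First item in the batch. [S2] Hello!",
    "[S1] Second item in the batch.",
]
audios = model.generate(texts, cfg_scale=3.0, temperature=1.2, top_p=0.95, verbose=True)

for i, audio in enumerate(audios):  # one waveform (or None) per prompt
    if audio is not None:
        model.save_audio(f"output_{i}.wav", audio)
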
dia/state.py CHANGED
@@ -1,4 +1,5 @@
1
  from dataclasses import dataclass
 
2
 
3
  import torch
4
 
@@ -14,29 +15,18 @@ def create_attn_mask(
14
  """
15
  Creates the attention mask (self or cross) mimicking JAX segment ID logic.
16
  """
17
- B1, Tq = q_padding_mask_1d.shape
18
- B2, Tk = k_padding_mask_1d.shape
19
- assert B1 == B2, "Query and key batch dimensions must match"
20
 
21
  p_mask_q = q_padding_mask_1d.unsqueeze(2) # Shape [B, Tq, 1]
22
  p_mask_k = k_padding_mask_1d.unsqueeze(1) # Shape [B, 1, Tk]
23
 
24
- # Condition A: Non-padding query attends to non-padding key
25
- non_pad_attends_non_pad = p_mask_q & p_mask_k # Shape [B, Tq, Tk]
26
-
27
- # Condition B: Padding query attends to padding key
28
- pad_attends_pad = (~p_mask_q) & (~p_mask_k) # Shape [B, Tq, Tk]
29
-
30
- # Combine: True if padding status is compatible (both non-pad OR both pad)
31
- mask = non_pad_attends_non_pad | pad_attends_pad # Shape [B, Tq, Tk]
32
-
33
  if is_causal:
34
- assert Tq == Tk, (
35
- "Causal mask requires query and key sequence lengths to be equal"
36
- )
37
  causal_mask_2d = torch.tril(
38
- torch.ones((Tq, Tk), dtype=torch.bool, device=device)
39
- ) # Shape [Tq, Tk]
40
  causal_mask = mask & causal_mask_2d # Shape [B, Tq, Tk]
41
  return causal_mask.unsqueeze(1) # Shape [B, 1, Tq, Tk]
42
  else:
@@ -58,19 +48,18 @@ class EncoderInferenceState:
58
  """Creates EtorchrInferenceParams from DiaConfig and a device."""
59
  device = cond_src.device
60
 
61
- positions = (
62
- torch.arange(config.data.text_length, device=device)
63
- .to(torch.long)
64
- .unsqueeze(0)
65
- .expand(2, -1)
66
- )
67
- padding_mask = (cond_src != config.data.text_pad_value).to(device).expand(2, -1)
68
  attn_mask = create_attn_mask(
69
  padding_mask, padding_mask, device, is_causal=False
70
  )
71
 
72
  return cls(
73
- max_seq_len=config.data.text_length,
74
  device=device,
75
  positions=positions,
76
  padding_mask=padding_mask,
@@ -78,9 +67,13 @@ class EncoderInferenceState:
78
  )
79
 
80
 
81
- class KVCache:
      def __init__(
          self,
          num_heads: int,
          max_len: int,
          head_dim: int,
@@ -89,21 +82,33 @@ class KVCache:
          k: torch.Tensor | None = None,
          v: torch.Tensor | None = None,
      ):
-         self.k = (
-             torch.zeros((2, num_heads, max_len, head_dim), dtype=dtype, device=device)
              if k is None
              else k
          )
-         self.v = (
-             torch.zeros((2, num_heads, max_len, head_dim), dtype=dtype, device=device)
              if v is None
              else v
          )
-         self.current_idx = torch.tensor(0)

      @classmethod
      def from_kv(cls, k: torch.Tensor, v: torch.Tensor) -> "KVCache":
          return cls(
              num_heads=k.shape[1],
              max_len=k.shape[2],
              head_dim=k.shape[3],
@@ -114,20 +119,17 @@ class KVCache:
          )

      def update(
-         self, k: torch.Tensor, v: torch.Tensor
      ) -> tuple[torch.Tensor, torch.Tensor]:
-         self.k[:, :, self.current_idx : self.current_idx + 1, :] = k
-         self.v[:, :, self.current_idx : self.current_idx + 1, :] = v
-         self.current_idx += 1
-         return self.k[:, :, : self.current_idx, :], self.v[:, :, : self.current_idx, :]

-     def prefill(
-         self, k: torch.Tensor, v: torch.Tensor
-     ) -> tuple[torch.Tensor, torch.Tensor]:
          prefill_len = k.shape[2]
          self.k[:, :, :prefill_len, :] = k
          self.v[:, :, :prefill_len, :] = v
-         self.current_idx = prefill_len - 1
  @dataclass
@@ -139,9 +141,10 @@ class DecoderInferenceState:
139
  enc_out: torch.Tensor
140
  enc_positions: torch.Tensor
141
  dec_positions: torch.Tensor
142
- dec_cross_attn_mask: torch.Tensor
143
  self_attn_cache: list[KVCache]
144
  cross_attn_cache: list[KVCache]
 
 
145
 
146
  @classmethod
147
  def new(
@@ -151,28 +154,36 @@ class DecoderInferenceState:
151
  enc_out: torch.Tensor,
152
  dec_cross_attn_cache: list[KVCache],
153
  compute_dtype: torch.dtype,
 
154
  ) -> "DecoderInferenceState":
155
  """Creates DecoderInferenceParams from DiaConfig and a device."""
156
  device = enc_out.device
157
- max_audio_len = config.data.audio_length
 
 
 
158
 
159
  dec_positions = torch.full(
160
- (2, 1), fill_value=0, dtype=torch.long, device=device
 
 
 
161
  )
162
- tgt_padding_mask = torch.ones((2, 1), dtype=torch.bool, device=device)
163
- dec_cross_attn_mask = create_attn_mask(
164
- tgt_padding_mask, enc_state.padding_mask, device, is_causal=False
165
  )
166
 
167
  self_attn_cache = [
168
  KVCache(
169
- config.model.decoder.kv_heads,
 
170
  max_audio_len,
171
- config.model.decoder.gqa_head_dim,
172
  compute_dtype,
173
  device,
174
  )
175
- for _ in range(config.model.decoder.n_layer)
176
  ]
177
 
178
  return cls(
@@ -181,54 +192,56 @@ class DecoderInferenceState:
181
  enc_out=enc_out,
182
  enc_positions=enc_state.positions,
183
  dec_positions=dec_positions,
184
- dec_cross_attn_mask=dec_cross_attn_mask,
185
  self_attn_cache=self_attn_cache,
186
  cross_attn_cache=dec_cross_attn_cache,
 
 
187
  )
188
 
189
  def prepare_step(self, step_from: int, step_to: int | None = None) -> None:
190
  if step_to is None:
191
  step_to = step_from + 1
192
- self.dec_positions = (
193
- torch.arange(step_from, step_to, device=self.device)
194
- .unsqueeze(0)
195
- .expand(2, -1)
196
- )
197
 
198
 
199
  @dataclass
200
  class DecoderOutput:
201
  generated_tokens: torch.Tensor
202
- prefill_step: int
203
 
204
  @classmethod
205
- def new(cls, config: DiaConfig, device: torch.device) -> "DecoderOutput":
206
- max_audio_len = config.data.audio_length
 
 
207
  return cls(
208
  generated_tokens=torch.full(
209
- (max_audio_len, config.data.channels),
210
  fill_value=-1,
211
  dtype=torch.int,
212
  device=device,
213
  ),
214
- prefill_step=0,
215
  )
216
 
217
  def get_tokens_at(self, step_from: int, step_to: int | None = None) -> torch.Tensor:
218
  if step_to is None:
219
  step_to = step_from + 1
220
- return self.generated_tokens[step_from:step_to, :]
221
 
222
  def update_one(self, dec_out: torch.Tensor, step: int, apply_mask: bool = False):
 
223
  if apply_mask:
224
- mask = self.generated_tokens[step : step + 1, :] == -1
225
- self.generated_tokens[step : step + 1, :] = torch.where(
226
- mask, dec_out, self.generated_tokens[step : step + 1, :]
227
  )
228
  else:
229
- self.generated_tokens[step : step + 1, :] = dec_out
230
 
231
- def prefill(self, dec_out: torch.Tensor, prefill_step: int):
232
- length = dec_out.shape[0]
233
- self.generated_tokens[0:length, :] = dec_out
234
- self.prefill_step = prefill_step
 
1
  from dataclasses import dataclass
2
+ from typing import Optional
3
 
4
  import torch
5
 
 
15
  """
16
  Creates the attention mask (self or cross) mimicking JAX segment ID logic.
17
  """
18
+ # B1, Tq = q_padding_mask_1d.shape
19
+ # B2, Tk = k_padding_mask_1d.shape
 
20
 
21
  p_mask_q = q_padding_mask_1d.unsqueeze(2) # Shape [B, Tq, 1]
22
  p_mask_k = k_padding_mask_1d.unsqueeze(1) # Shape [B, 1, Tk]
23
 
24
+ mask = p_mask_q & p_mask_k
 
 
 
 
 
 
 
 
25
  if is_causal:
26
+ # assert Tq == Tk, "Causal mask requires query and key sequence lengths to be equal"
 
 
27
  causal_mask_2d = torch.tril(
28
+ torch.ones_like(mask[0], dtype=torch.bool, device=device)
29
+ ) # Shape [B, Tq, Tk]
30
  causal_mask = mask & causal_mask_2d # Shape [B, Tq, Tk]
31
  return causal_mask.unsqueeze(1) # Shape [B, 1, Tq, Tk]
32
  else:
 
48
  """Creates EtorchrInferenceParams from DiaConfig and a device."""
49
  device = cond_src.device
50
 
51
+ positions = torch.arange(
52
+ config.encoder_config.max_position_embeddings,
53
+ dtype=torch.float32,
54
+ device=device,
55
+ ).unsqueeze(0)
56
+ padding_mask = (cond_src.squeeze(1) != 0).to(device).repeat_interleave(2, dim=0)
 
57
  attn_mask = create_attn_mask(
58
  padding_mask, padding_mask, device, is_causal=False
59
  )
60
 
61
  return cls(
62
+ max_seq_len=config.encoder_config.max_position_embeddings,
63
  device=device,
64
  positions=positions,
65
  padding_mask=padding_mask,
 
67
  )
68
 
69
 
70
+ class KVCache(torch.nn.Module):
71
+ k: torch.Tensor
72
+ v: torch.Tensor
73
+
74
  def __init__(
75
  self,
76
+ batch_size: int,
77
  num_heads: int,
78
  max_len: int,
79
  head_dim: int,
 
82
  k: torch.Tensor | None = None,
83
  v: torch.Tensor | None = None,
84
  ):
85
+ k = (
86
+ torch.zeros(
87
+ (2 * batch_size, num_heads, max_len, head_dim),
88
+ dtype=dtype,
89
+ device=device,
90
+ )
91
  if k is None
92
  else k
93
  )
94
+ v = (
95
+ torch.zeros(
96
+ (2 * batch_size, num_heads, max_len, head_dim),
97
+ dtype=dtype,
98
+ device=device,
99
+ )
100
  if v is None
101
  else v
102
  )
103
+ super().__init__()
104
+
105
+ self.register_buffer("k", k)
106
+ self.register_buffer("v", v)
107
 
108
  @classmethod
109
  def from_kv(cls, k: torch.Tensor, v: torch.Tensor) -> "KVCache":
110
  return cls(
111
+ batch_size=k.shape[0] // 2,
112
  num_heads=k.shape[1],
113
  max_len=k.shape[2],
114
  head_dim=k.shape[3],
 
119
  )
120
 
121
  def update(
122
+ self, k: torch.Tensor, v: torch.Tensor, current_idx: torch.Tensor
123
  ) -> tuple[torch.Tensor, torch.Tensor]:
124
+ k_out, v_out = self.k, self.v
125
+ k_out[:, :, current_idx, :] = k
126
+ v_out[:, :, current_idx, :] = v
127
+ return self.k, self.v
128
 
129
+ def prefill(self, k: torch.Tensor, v: torch.Tensor):
 
 
130
  prefill_len = k.shape[2]
131
  self.k[:, :, :prefill_len, :] = k
132
  self.v[:, :, :prefill_len, :] = v
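
Registering `k`/`v` as buffers (instead of plain attributes, as before) means the pre-allocated cache tensors follow `.to(device)` moves and appear in `state_dict()` without becoming trainable parameters; a minimal, self-contained sketch of that behavior:

import torch

class TinyCache(torch.nn.Module):
    def __init__(self, max_len: int, head_dim: int):
        super().__init__()
        # Pre-allocated storage that is module state but not a parameter.
        self.register_buffer("k", torch.zeros(max_len, head_dim))

cache = TinyCache(8, 4)
print(list(cache.state_dict().keys()))          # ['k']
print(sum(p.numel() for p in cache.parameters()))  # 0
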
 
133
 
134
 
135
  @dataclass
 
141
  enc_out: torch.Tensor
142
  enc_positions: torch.Tensor
143
  dec_positions: torch.Tensor
 
144
  self_attn_cache: list[KVCache]
145
  cross_attn_cache: list[KVCache]
146
+ casual_attn_mask: torch.Tensor
147
+ cross_attn_mask: torch.Tensor
148
 
149
  @classmethod
150
  def new(
 
154
  enc_out: torch.Tensor,
155
  dec_cross_attn_cache: list[KVCache],
156
  compute_dtype: torch.dtype,
157
+ max_generation_length: Optional[int] = None,
158
  ) -> "DecoderInferenceState":
159
  """Creates DecoderInferenceParams from DiaConfig and a device."""
160
  device = enc_out.device
161
+ max_audio_len = (
162
+ max_generation_length or config.decoder_config.max_position_embeddings
163
+ )
164
+ batch_size = enc_out.shape[0] // 2
165
 
166
  dec_positions = torch.full(
167
+ (2 * batch_size, 1), fill_value=0, dtype=torch.int32, device=device
168
+ )
169
+ causal_mask = torch.tril(
170
+ torch.ones(max_audio_len, max_audio_len, dtype=torch.bool, device=device)
171
  )
172
+ dec_mask = torch.ones((2 * batch_size, 1), dtype=torch.bool, device=device)
173
+ cross_attn_mask = create_attn_mask(
174
+ dec_mask, enc_state.padding_mask, device, is_causal=False
175
  )
176
 
177
  self_attn_cache = [
178
  KVCache(
179
+ batch_size,
180
+ config.decoder_config.num_key_value_heads,
181
  max_audio_len,
182
+ config.decoder_config.head_dim,
183
  compute_dtype,
184
  device,
185
  )
186
+ for _ in range(config.decoder_config.num_hidden_layers)
187
  ]
188
 
189
  return cls(
 
192
  enc_out=enc_out,
193
  enc_positions=enc_state.positions,
194
  dec_positions=dec_positions,
 
195
  self_attn_cache=self_attn_cache,
196
  cross_attn_cache=dec_cross_attn_cache,
197
+ casual_attn_mask=causal_mask,
198
+ cross_attn_mask=cross_attn_mask,
199
  )
200
 
201
  def prepare_step(self, step_from: int, step_to: int | None = None) -> None:
202
  if step_to is None:
203
  step_to = step_from + 1
204
+ self.dec_positions = torch.arange(
205
+ step_from, step_to, dtype=torch.int32, device=self.device
206
+ ).unsqueeze(0)
 
 
207
 
208
 
209
  @dataclass
210
  class DecoderOutput:
211
  generated_tokens: torch.Tensor
212
+ prefill_steps: list[int]
213
 
214
  @classmethod
215
+ def new(
216
+ cls, batch_size: int, config: DiaConfig, device: torch.device
217
+ ) -> "DecoderOutput":
218
+ max_audio_len = config.decoder_config.max_position_embeddings
219
  return cls(
220
  generated_tokens=torch.full(
221
+ (batch_size, max_audio_len, config.decoder_config.num_channels),
222
  fill_value=-1,
223
  dtype=torch.int,
224
  device=device,
225
  ),
226
+ prefill_steps=[],
227
  )
228
 
229
  def get_tokens_at(self, step_from: int, step_to: int | None = None) -> torch.Tensor:
230
  if step_to is None:
231
  step_to = step_from + 1
232
+ return self.generated_tokens[:, step_from:step_to, :]
233
 
234
  def update_one(self, dec_out: torch.Tensor, step: int, apply_mask: bool = False):
235
+ dec_out = dec_out.to(self.generated_tokens.dtype)
236
  if apply_mask:
237
+ mask = self.generated_tokens[:, step, :] == -1
238
+ self.generated_tokens[:, step, :] = torch.where(
239
+ mask, dec_out, self.generated_tokens[:, step, :]
240
  )
241
  else:
242
+ self.generated_tokens[:, step, :] = dec_out
243
 
244
+ def prefill(self, dec_out: torch.Tensor, prefill_steps: list[int]):
245
+ length = dec_out.shape[1]
246
+ self.generated_tokens[:, :length, :] = dec_out
247
+ self.prefill_steps = prefill_steps
requirements.txt CHANGED
@@ -4,6 +4,7 @@ huggingface-hub>=0.30.2
  numpy>=2.2.4
  pydantic>=2.11.3
  soundfile>=0.13.1
- torchaudio>=2.0.0
- torch>=2.0.0
  gradio-dialogue>=0.0.4

  numpy>=2.2.4
  pydantic>=2.11.3
  soundfile>=0.13.1
+ torchaudio==2.6.0
+ torch==2.6.0
+ triton==3.2.0
  gradio-dialogue>=0.0.4
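
Since torch, torchaudio, and triton are now pinned to exact versions, a quick runtime sanity check of the installed stack can look like this editor's sketch:

import torch
import torchaudio

assert torch.__version__.startswith("2.6"), torch.__version__
assert torchaudio.__version__.startswith("2.6"), torchaudio.__version__
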