Spaces:

AbstractPhil
/

shunt-adapter-testing

Runtime error

App Files Files Community

AbstractPhil committed on Jul 15

Commit

788f431

verified ·

1 Parent(s): 1fe00bb

Update two_stream_shunt_adapter.py

Browse files

Files changed (1) hide show

two_stream_shunt_adapter.py +376 -81

two_stream_shunt_adapter.py CHANGED Viewed

@@ -1,115 +1,410 @@
-# adapter_v2.py ────────────────────────────────────────────────────────────
-import torch, math
 import torch.nn as nn
-import torch.nn.functional as F
-# ─── Residual pocket block ────────────────────────────────────────────────
-class PocketBlock(nn.Module):
     def __init__(self, dim, kernel=3, dropout=0.0):
         super().__init__()
-        self.body = nn.Sequential(
-            nn.LayerNorm(dim),
-            nn.Conv1d(dim, dim, kernel, padding=kernel // 2, groups=1),
             nn.GELU(),
-            nn.Conv1d(dim, dim, kernel, padding=kernel // 2, groups=1),
-            nn.Dropout(dropout),
         )
     def forward(self, x):
-        y = self.body(x.transpose(1, 2)).transpose(1, 2)
-        return x + y
-# ─── adapter ──────────────────────────────────────────────────────────────
-class TwoStreamShuntAdapter(nn.Module):
-    """T5-seq  ➔ bottleneck  ⇄  CLIP-seq  →  anchor / delta / σ …"""
-    def __init__(self, cfg: dict):
         super().__init__()
-        self.cfg       = cfg
-        hid_t5         = cfg["t5"]["hidden_size"]
-        hid_clip       = cfg["clip"]["hidden_size"]
-        bneck          = cfg["bottleneck"]
-        heads          = cfg["heads"]
-        proj_layers    = cfg.get("proj_layers", 2)
-        use_norm       = cfg.get("layer_norm", True)
-        p_drop         = cfg.get("dropout", 0.0)
-        pocket_depth   = cfg.get("pocket_depth", 2)
-        # helper ----------------------------------------------------------------
-        def proj(in_d, out_d):
-            layers, d = [], in_d
-            for i in range(proj_layers):
-                if use_norm:
-                    layers.append(nn.LayerNorm(d))
-                layers += [nn.Linear(d, bneck if i == proj_layers - 1 else bneck * 2),
-                           nn.GELU()]
-                if p_drop: layers.append(nn.Dropout(p_drop))
-                d = bneck
             return nn.Sequential(*layers)
-        # projections -----------------------------------------------------------
-        self.t5_in   = proj(hid_t5,   bneck)
-        self.clip_in = proj(hid_clip, bneck)
-        # bidirectional cross-attention ----------------------------------------
-        self.attn_t2c = nn.MultiheadAttention(bneck, heads, batch_first=True, dropout=p_drop)
-        self.attn_c2t = nn.MultiheadAttention(bneck, heads, batch_first=True, dropout=p_drop)
-        self.tau      = nn.Parameter(torch.full((heads, 1, 1), cfg.get("tau_init", 1.0)))
-        # pocket stack ----------------------------------------------------------
-        self.pocket = nn.Sequential(*[PocketBlock(bneck, dropout=p_drop) for _ in range(pocket_depth)])
-        # fuse bottleneck → bneck ----------------------------------------------
         self.fuse = nn.Sequential(
-            nn.LayerNorm(bneck * 2),
-            nn.Linear(bneck * 2, bneck * 2),
             nn.GELU(),
-            nn.Linear(bneck * 2, bneck)
         )
-        # head projections ------------------------------------------------------
-        self.anchor_out = proj(bneck, hid_clip)
-        self.delta_out  = proj(bneck, hid_clip)
-        self.sigma_out  = proj(bneck, hid_clip)    # log σ
-        self.gate_guid_proj = nn.Sequential(
-            nn.LayerNorm(bneck),
-            nn.Linear(bneck, bneck),
             nn.GELU(),
-            nn.Linear(bneck, 2),                   # [:, :, 0] → gate, [:, :, 1] → g_pred
         )
-        self.max_guidance = cfg.get("max_guidance", 2.0)
-    # --- forward --------------------------------------------------------------
-    def forward(self, t5_seq: torch.Tensor, clip_seq: torch.Tensor):
-        assert t5_seq.size(-1)   == self.cfg["t5"]["hidden_size"]
-        assert clip_seq.size(-1) == self.cfg["clip"]["hidden_size"]
-        t5_b   = self.t5_in(t5_seq)
-        clip_b = self.clip_in(clip_seq)
-        t2c, attn_t2c = self.attn_t2c(t5_b,  clip_b, clip_b, need_weights=True, average_attn_weights=False)
-        c2t, attn_c2t = self.attn_c2t(clip_b, t5_b,  t5_b,  need_weights=True, average_attn_weights=False)
-        p   = self.pocket(t2c)
-        z   = torch.cat([p.mean(1, keepdim=True).expand_as(c2t), c2t], dim=-1)
-        h   = self.fuse(z)
-        anchor    = self.anchor_out(h)
-        delta     = self.delta_out(h)
-        log_sigma = self.sigma_out(h)
-        gate_and_g = self.gate_guid_proj(h)
-        gate   = torch.sigmoid(gate_and_g[..., 0:1])
-        g_pred = torch.clamp(gate_and_g[..., 1:2].mean(1, keepdim=True),
-                             0, self.max_guidance)
-        return (anchor, delta, log_sigma,
-                attn_t2c, attn_c2t,
-                self.tau,
-                g_pred,
-                gate)

+from typing import Tuple
+import torch
 import torch.nn as nn
+from .configs import ENCODER_CONFIGS, HARMONIC_SHUNT_REPOS
+class DualConversionNames:
+    """
+    Rename table used when migrating legacy dual-adapter checkpoints.
+
+    ``LAYER_NAMES`` maps a legacy (T5/CLIP-era) layer name to its name in the
+    condition/modulation schema. It is read by ``load_converted_safetensors``.
+    Entry order is part of the data: dict insertion order is preserved.
+
+    NOTE(review): the V1 ``TwoStreamShuntAdapter`` in this file names its input
+    projections ``proj_t5`` / ``proj_clip``, while the first two entries here
+    are keyed ``t5_proj`` / ``clip_proj`` - confirm which checkpoint generation
+    those two entries are meant to cover.
+    """
+    LAYER_NAMES = dict(
+        # input projections
+        t5_proj="condition_projection",
+        clip_proj="modulation_projection",
+        # cross attention (condition -> modulation, modulation -> condition)
+        cross_t2c="cross_c2m",
+        cross_c2t="cross_m2c",
+        # output heads
+        anchor_proj="anchor_projection",
+        delta_proj="delta_projection",
+        logsig_proj="log_sigma_projection",
+        # gate / guidance heads
+        gate_proj="gate_projection",
+        guidance_proj="guidance_projection",
+        # fusion MLP
+        fuse="fusion_block",
+        # residual pocket stack
+        pocket_blocks="residual_pocket_block",
+    )
+# ─── Residual Pocket Block ───────────────────────────────────
+class BottleneckResBlock(nn.Module):
+    """
+    Pre-norm residual block: LayerNorm -> Conv1d along the sequence -> MLP.
+
+    ``forward`` normalises over the last axis, swaps axes 1 and 2 so the
+    convolution sees channels-first input, swaps back, runs a 2x-wide
+    Linear/GELU/Linear/Dropout MLP, and adds the untouched input.
+    Input is therefore expected as (batch, seq, dim); output has the same
+    shape for odd ``kernel`` (padding is ``kernel // 2``).
+    """
     def __init__(self, dim, kernel=3, dropout=0.0):
         super().__init__()
+        # Attribute names and creation order are kept stable: they define the
+        # state_dict keys and the order in which weights are initialised.
+        self.norm = nn.LayerNorm(dim)
+        same_pad = kernel // 2
+        self.conv = nn.Conv1d(dim, dim, kernel, padding=same_pad, groups=1)
+        hidden = 2 * dim
+        self.proj = nn.Sequential(
+            nn.Linear(dim, hidden),
             nn.GELU(),
+            nn.Linear(hidden, dim),
+            nn.Dropout(p=dropout),
         )
     def forward(self, x):
+        normed = self.norm(x)
+        # Conv1d wants (batch, channels, seq); flip in and back out.
+        mixed = self.conv(normed.transpose(1, 2)).transpose(1, 2)
+        return x + self.proj(mixed)
+class ConditionModulationShuntAdapter(nn.Module):
+    """
+    Two-stream shunt adapter using the condition/modulation naming schema.
+
+    Both input sequences are projected to a shared bottleneck width, attended
+    against each other in both directions, fused, and decoded into anchor,
+    gated delta, log-sigma, a per-token gate and a scalar guidance prediction.
+
+    Config keys read here:
+        condition_encoders / modulation_encoders: lists whose first entry
+            supplies ``hidden_size`` (default 768). An empty or missing list
+            raises IndexError.
+        bottleneck, heads, tau_init, max_guidance: required.
+        layer_norm (True), use_dropout (True), dropout (0.0),
+        proj_layers (2), dtype (torch.float32), assert_input_dims (True).
+    """
+    def __init__(self, config: dict):
+        super().__init__()
+        self.config = config
+        self.dtype = config.get("dtype", torch.float32)
+        self.condition_dim = config.get("condition_encoders", [])[0].get("hidden_size", 768)
+        self.modulation_dim = config.get("modulation_encoders", [])[0].get("hidden_size", 768)
+        self.bneck = config["bottleneck"]
+        self.heads = config["heads"]
+        self.tau_init = config["tau_init"]
+        self.max_guidance = config["max_guidance"]
+        use_norm   = config.get("layer_norm", True)
+        use_do     = config.get("use_dropout", True)
+        do_p       = config.get("dropout", 0.0)
+        proj_depth = config.get("proj_layers", 2)
+        def build_projection(input_dim, output_dim):
+            """[LayerNorm] -> proj_depth x (Linear, GELU, [Dropout]) -> Linear."""
+            layers = []
+            last_dim = input_dim
+            if use_norm:
+                layers.append(nn.LayerNorm(last_dim))
+            for i in range(proj_depth):
+                # Only the first hidden layer is widened, and only when the
+                # stack has more than one layer.
+                next_dim = self.bneck * (2 if i == 0 and proj_depth > 1 else 1)
+                layers.append(nn.Linear(last_dim, next_dim))
+                layers.append(nn.GELU())
+                if use_do:
+                    layers.append(nn.Dropout(do_p))
+                last_dim = next_dim
+            layers.append(nn.Linear(last_dim, output_dim))
+            return nn.Sequential(*layers)
+        # Projection layers
+        self.condition_projection = build_projection(self.condition_dim, self.bneck)
+        self.modulation_projection = build_projection(self.modulation_dim, self.bneck)
+        # Cross attention blocks
+        self.cross_c2m = nn.MultiheadAttention(self.bneck, self.heads, batch_first=True, dropout=do_p)
+        self.cross_m2c = nn.MultiheadAttention(self.bneck, self.heads, batch_first=True, dropout=do_p)
+        # float(): torch.full infers an integer dtype from an int fill value,
+        # and integer tensors cannot be wrapped in a trainable Parameter.
+        self.tau       = nn.Parameter(torch.full((self.heads, 1, 1), float(self.tau_init)))
+        # Residual processing block
+        self.residual_pocket_block = nn.Sequential(
+            BottleneckResBlock(self.bneck, dropout=do_p),
+            BottleneckResBlock(self.bneck, dropout=do_p)
+        )
+        # Fusion pathway
+        self.fusion_block = nn.Sequential(
+            nn.LayerNorm(2 * self.bneck),
+            nn.Linear(2 * self.bneck, self.bneck * 2),
+            nn.GELU(),
+            nn.Linear(self.bneck * 2, self.bneck)
+        )
+        # Output projections
+        self.anchor_projection    = build_projection(self.bneck, self.modulation_dim)
+        self.delta_projection     = build_projection(self.bneck, self.modulation_dim)
+        self.log_sigma_projection = build_projection(self.bneck, self.modulation_dim)
+        # Gate and guidance
+        # NOTE(review): Tanh followed by Sigmoid limits the gate to roughly
+        # (0.27, 0.73). Left as-is so existing checkpoints behave the same;
+        # confirm whether the narrow range is intended.
+        self.gate_projection = nn.Sequential(
+            nn.LayerNorm(self.bneck),
+            nn.Linear(self.bneck, self.bneck),
+            nn.GELU(),
+            nn.Linear(self.bneck, 1),
+            nn.Tanh(),
+            nn.Sigmoid()
+        )
+        self.guidance_projection = nn.Sequential(
+            nn.LayerNorm(self.bneck),
+            nn.Linear(self.bneck, 1),
+            nn.Sigmoid()
+        )
+        # ─── Legacy Aliases (Version 1 Compatibility) ──────────────────────────
+        # Each alias registers the SAME module under a second name, so
+        # state_dict() lists every aliased weight under both names.
+        self.proj_t5        = self.condition_projection
+        self.proj_clip      = self.modulation_projection
+        self.cross_t2c      = self.cross_c2m
+        self.cross_c2t      = self.cross_m2c
+        self.pocket_blocks  = self.residual_pocket_block
+        self.fuse           = self.fusion_block
+        self.anchor_proj    = self.anchor_projection
+        self.delta_proj     = self.delta_projection
+        self.logsig_proj    = self.log_sigma_projection
+        self.gate_proj      = self.gate_projection
+        self.guidance_proj  = self.guidance_projection
+    def forward(self, cond_seq: torch.Tensor, mod_seq: torch.Tensor, config: dict = None):
+        """
+        Run the adapter.
+
+        Args:
+            cond_seq: condition stream, last axis ``condition_dim``
+                (batch-first, as required by the attention layers).
+            mod_seq: modulation stream, last axis ``modulation_dim``.
+            config: optional per-call overrides. ``max_guidance`` is used when
+                positive; otherwise the constructor value; if that is also
+                non-positive, ``guidance_scale`` (default 10.0).
+
+        Returns:
+            ``(anchor, delta, log_sigma, attn_c2m, attn_m2c, tau, g_pred, gate)``.
+            ``delta`` is already multiplied by ``gate``; attention weights are
+            per-head (not averaged); ``g_pred`` is the token-mean of the
+            guidance head scaled by the resolved max guidance.
+        """
+        if self.config.get("assert_input_dims", True):
+            assert cond_seq.size(-1) == self.condition_dim
+            assert mod_seq.size(-1) == self.modulation_dim
+        max_guidance = self.max_guidance if config is None else config.get("max_guidance", 0.0)
+        if max_guidance <= 0:
+            max_guidance = self.max_guidance
+        if max_guidance <= 0:
+            # config may legitimately be None here; never call .get on it directly.
+            max_guidance = (config or {}).get("guidance_scale", 10.0)
+        cond_b = self.condition_projection(cond_seq)
+        mod_b  = self.modulation_projection(mod_seq)
+        c2m, attn_c2m = self.cross_c2m(cond_b, mod_b, mod_b, need_weights=True, average_attn_weights=False)
+        m2c, attn_m2c = self.cross_m2c(mod_b, cond_b, cond_b, need_weights=True, average_attn_weights=False)
+        pocket = self.residual_pocket_block(c2m)
+        # Collapse the condition-side summary over its sequence axis and
+        # broadcast it across every modulation token before fusing.
+        pocket_mean = pocket.mean(1, keepdim=True).expand(-1, mod_b.size(1), -1)
+        h = self.fusion_block(torch.cat([pocket_mean, m2c], dim=-1))
+        # The gate head has no stochastic layers, so evaluate it once and reuse it.
+        gate      = self.gate_projection(h)
+        anchor    = self.anchor_projection(h)
+        delta     = self.delta_projection(h) * gate
+        log_sigma = self.log_sigma_projection(h)
+        g_tok  = self.guidance_projection(h).squeeze(-1)
+        g_pred = g_tok.mean(1, keepdim=True) * max_guidance
+        return anchor, delta, log_sigma, attn_c2m, attn_m2c, self.tau, g_pred, gate
+# ─── V1 Original Two Stream Shunt Adapter ──────────────────────────────────────
+class TwoStreamShuntAdapter(nn.Module):
+    """
+    V1 two-stream (T5 / CLIP) shunt adapter with the original layer names.
+
+    Same topology as the condition/modulation adapter: project both streams to
+    the bottleneck width, cross-attend both ways, fuse, then decode anchor,
+    gated delta, log-sigma, gate and guidance.
+
+    Config keys read here:
+        condition_encoders / modulation_encoders: lists whose first entry
+            supplies ``hidden_size`` (default 768) for T5 / CLIP respectively.
+        bottleneck, heads, tau_init, max_guidance: required.
+        layer_norm (True), use_dropout (True), dropout (0.0),
+        proj_layers (2), dtype (torch.float32), assert_input_dims (True).
+    """
+    def __init__(self, config: dict):
         super().__init__()
+        self.config = config
+        self.dtype = config.get("dtype", torch.float32)
+        self.t5_dim = config.get("condition_encoders", [])[0].get("hidden_size", 768)
+        self.clip_dim = config.get("modulation_encoders", [])[0].get("hidden_size", 768)
+        self.bneck = config["bottleneck"]
+        self.heads = config["heads"]
+        self.tau_init = config["tau_init"]
+        self.max_guidance = config["max_guidance"]
+        use_norm   = config.get("layer_norm", True)
+        use_do     = config.get("use_dropout", True)
+        do_p       = config.get("dropout", 0.0)
+        proj_depth = config.get("proj_layers", 2)
+        def build_projection(input_dim, output_dim):
+            """[LayerNorm] -> proj_depth x (Linear, GELU, [Dropout]) -> Linear."""
+            layers = []
+            last_dim = input_dim
+            if use_norm:
+                layers.append(nn.LayerNorm(last_dim))
+            for i in range(proj_depth):
+                # Only the first hidden layer is widened, and only when the
+                # stack has more than one layer.
+                next_dim = self.bneck * (2 if i == 0 and proj_depth > 1 else 1)
+                layers.append(nn.Linear(last_dim, next_dim))
+                layers.append(nn.GELU())
+                if use_do:
+                    layers.append(nn.Dropout(do_p))
+                last_dim = next_dim
+            layers.append(nn.Linear(last_dim, output_dim))
             return nn.Sequential(*layers)
+        # Projections
+        self.proj_t5   = build_projection(self.t5_dim, self.bneck)
+        self.proj_clip = build_projection(self.clip_dim, self.bneck)
+        # Attention
+        self.cross_t2c = nn.MultiheadAttention(self.bneck, self.heads, batch_first=True, dropout=do_p)
+        self.cross_c2t = nn.MultiheadAttention(self.bneck, self.heads, batch_first=True, dropout=do_p)
+        # float(): torch.full infers an integer dtype from an int fill value,
+        # and integer tensors cannot be wrapped in a trainable Parameter.
+        self.tau       = nn.Parameter(torch.full((self.heads, 1, 1), float(self.tau_init)))
+        # Residual Pocket
+        self.pocket_blocks = nn.Sequential(
+            BottleneckResBlock(self.bneck, dropout=do_p),
+            BottleneckResBlock(self.bneck, dropout=do_p)
+        )
+        # Fuse
         self.fuse = nn.Sequential(
+            nn.LayerNorm(2 * self.bneck),
+            nn.Linear(2 * self.bneck, self.bneck * 2),
             nn.GELU(),
+            nn.Linear(self.bneck * 2, self.bneck)
         )
+        # Output Projections
+        self.anchor_proj = build_projection(self.bneck, self.clip_dim)
+        self.delta_proj  = build_projection(self.bneck, self.clip_dim)
+        self.logsig_proj = build_projection(self.bneck, self.clip_dim)
+        # NOTE(review): Tanh followed by Sigmoid limits the gate to roughly
+        # (0.27, 0.73). Left as-is so existing checkpoints behave the same;
+        # confirm whether the narrow range is intended.
+        self.gate_proj = nn.Sequential(
+            nn.LayerNorm(self.bneck),
+            nn.Linear(self.bneck, self.bneck),
             nn.GELU(),
+            nn.Linear(self.bneck, 1),
+            nn.Tanh(),
+            nn.Sigmoid()
         )
+        self.guidance_proj = nn.Sequential(
+            nn.LayerNorm(self.bneck),
+            nn.Linear(self.bneck, 1),
+            nn.Sigmoid()
+        )
+    def forward(self, t5_seq: torch.Tensor, clip_seq: torch.Tensor, config: dict = None):
+        """
+        Run the adapter.
+
+        Args:
+            t5_seq: T5 stream, last axis ``t5_dim`` (batch-first).
+            clip_seq: CLIP stream, last axis ``clip_dim``.
+            config: optional per-call overrides. When given, the guidance
+                ceiling is ``config["guidance_scale"]`` (default 5.0). When
+                omitted, it is the constructor ``max_guidance`` if positive,
+                else 10.
+
+        Returns:
+            ``(anchor, delta, log_sigma, attn_t2c, attn_c2t, tau, g_pred, gate)``.
+            ``delta`` is already multiplied by ``gate``; attention weights are
+            per-head (not averaged).
+        """
+        if self.config.get("assert_input_dims", True):
+            assert t5_seq.size(-1) == self.t5_dim
+            assert clip_seq.size(-1) == self.clip_dim
+        if config is None:
+            # Previously this path always crashed on ``config.get`` even though
+            # ``config=None`` is the default; fall back to the constructor value.
+            max_guidance = self.max_guidance if self.max_guidance > 0 else 10
+        else:
+            # Unchanged behaviour for callers that pass a dict.
+            # NOTE(review): config["max_guidance"] has never influenced the
+            # result on this path - confirm whether guidance_scale should
+            # really take precedence over it.
+            max_guidance = config.get("guidance_scale", 5.0)
+        t5_b   = self.proj_t5(t5_seq)
+        clip_b = self.proj_clip(clip_seq)
+        t2c, attn_t2c = self.cross_t2c(t5_b, clip_b, clip_b, need_weights=True, average_attn_weights=False)
+        c2t, attn_c2t = self.cross_c2t(clip_b, t5_b, t5_b, need_weights=True, average_attn_weights=False)
+        pocket = self.pocket_blocks(t2c)
+        # Collapse the T5-side summary over its sequence axis and broadcast it
+        # across every CLIP token before fusing.
+        pocket_mean = pocket.mean(1, keepdim=True).expand(-1, clip_b.size(1), -1)
+        h = self.fuse(torch.cat([pocket_mean, c2t], dim=-1))
+        # The gate head has no stochastic layers, so evaluate it once and reuse it.
+        gate      = self.gate_proj(h)
+        anchor    = self.anchor_proj(h)
+        delta     = self.delta_proj(h) * gate
+        log_sigma = self.logsig_proj(h)
+        g_tok  = self.guidance_proj(h).squeeze(-1)
+        g_pred = g_tok.mean(1, keepdim=True) * max_guidance
+        return anchor, delta, log_sigma, attn_t2c, attn_c2t, self.tau, g_pred, gate
+from safetensors.torch import save_file, load_file
+def save_safetensors(adapter: nn.Module, path: str, metadata: dict = None):
+    """
+    Save the adapter's state_dict to ``path`` in safetensors format.
+
+    Every tensor is written as an independent, contiguous float32 CPU copy.
+    The explicit copy matters: ``.float().cpu()`` returns the original tensor
+    when it is already float32 on CPU, so a module registered under two names
+    (e.g. a legacy alias) would reach ``save_file`` as two entries sharing one
+    storage, which safetensors refuses to serialise.
+
+    Args:
+        adapter: module whose ``state_dict()`` is saved.
+        path: destination file.
+        metadata: optional header entries (e.g. version, prompt_mode). Keys
+            and values are converted with ``str`` because the safetensors
+            header only holds string-to-string pairs.
+    """
+    state = {
+        name: tensor.detach().float().cpu().contiguous().clone()
+        for name, tensor in adapter.state_dict().items()
+    }
+    header = {str(key): str(value) for key, value in (metadata or {}).items()}
+    save_file(state, path, metadata=header)
+    print(f"✅ Model saved to {path}")
+def load_safetensors(adapter: nn.Module, path: str, map_location="cpu"):
+    """
+    Restore ``adapter`` from a safetensors checkpoint.
+
+    The file at ``path`` is read onto ``map_location`` and applied with
+    ``strict=True``, so any missing or unexpected key raises.
+    """
+    adapter.load_state_dict(load_file(path, device=map_location), strict=True)
+    print(f"✅ Model loaded from {path}")
+def load_converted_safetensors(adapter: nn.Module, path: str, map_location="cpu"):
+    """
+    Load a legacy-format checkpoint into an adapter using the updated schema.
+
+    Each checkpoint key is split on ``.`` and every path component that is
+    exactly a legacy name in ``DualConversionNames.LAYER_NAMES`` is replaced
+    by its new name. Matching whole components (rather than substrings) keeps
+    keys that are already in the new schema intact: a substring match on
+    ``gate_proj`` would turn ``gate_projection`` into ``gate_projectionection``.
+
+    Keys that need no renaming are kept only if the target adapter has them;
+    the rest are reported and dropped. The result is loaded with
+    ``strict=False`` and a summary is printed.
+
+    Args:
+        adapter: target module.
+        path: safetensors checkpoint to convert.
+        map_location: device the tensors are loaded onto.
+    """
+    state = load_file(path, device=map_location)
+    rename_map = DualConversionNames.LAYER_NAMES
+    # Build the target key set once instead of re-materialising state_dict()
+    # for every checkpoint entry.
+    target_keys = set(adapter.state_dict())
+    new_state = {}
+    matched, renamed, skipped = 0, 0, 0
+    for key, tensor in state.items():
+        new_key = ".".join(rename_map.get(part, part) for part in key.split("."))
+        if new_key != key:
+            new_state[new_key] = tensor
+            print(f"[MIGRATE] {key} → {new_key}")
+            renamed += 1
+        elif key in target_keys:
+            new_state[key] = tensor
+            matched += 1
+        else:
+            print(f"[SKIP]   {key} not found in target adapter.")
+            skipped += 1
+    adapter.load_state_dict(new_state, strict=False)
+    print(f"\n✅ Converted model loaded from {path}")
+    print(f"   🔁 Renamed Keys: {renamed}")
+    print(f"   ✅ Direct Matches: {matched}")
+    print(f"   ⚠️  Skipped Keys: {skipped}")
+def reshape_for_shunt(
+    encoder_embeddings: torch.Tensor,
+    clip_slice: torch.Tensor,
+    adapter_model
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Return ``(encoder_embeddings, clip_slice)`` unchanged.
+
+    This is a deliberate pass-through: no interpolation, projection or
+    validation is performed, and ``adapter_model`` is accepted only to keep
+    the call signature stable.
+
+    An earlier draft of sequence interpolation and feature projection sat
+    after an unconditional ``return`` and never executed. It has been removed
+    rather than enabled because it could not have worked as written: it
+    called ``permute(0, 0, 2)`` (a repeated axis), derived the target
+    sequence length from the adapter's feature dimensions, and projected
+    through freshly initialised ``nn.Linear`` layers that were discarded
+    immediately afterwards.
+    """
+    return encoder_embeddings, clip_slice