Dionyssos committed · Commit 17a68db · 1 Parent(s): 5f5d0ea

preserve only few last kv

Files changed (4):
  1. README.md +3 -1
  2. audiocraft/builders.py +8 -14
  3. audiocraft/transformer.py +13 -21
  4. msinference.py +1 -1
README.md CHANGED
@@ -67,7 +67,9 @@ CUDA_DEVICE_ORDER=PCI_BUS_ID HF_HOME=/data/dkounadis/.hf7/ CUDA_VISIBLE_DEVICES=
 
 Following examples need `api.py` to be running. [Set this IP](https://huggingface.co/dkounadis/artificial-styletts2/blob/main/tts.py#L85) to the IP shown when starting `api.py`.
 
-<div><iframe width="560" height="315" src="https://www.youtube.com/embed/2YjxAPkdXIc?si=eVpClu_7whMAdWi0" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture; web-share" referrerpolicy="strict-origin-when-cross-origin" allowfullscreen></iframe></div>
+```
+python tts.py --text assets/ocr.txt --image assets/ocr.jpg --soundscape "battle hero" --voice romanian
+```
 
 </details>
 
audiocraft/builders.py CHANGED
@@ -11,15 +11,13 @@ from .lm import LMModel
 from .seanet import SEANetDecoder
 from .vq import ResidualVectorQuantizer
 
-N_REPEAT = 7  # num (virtual batch_size) clones of audio sounds
+N_REPEAT = 3  # num (virtual batch_size) clones of audio sounds
 
 def _shift(x):
-    # [bs, samples] circular-shift each batch elem of sound
-    n = x.shape[1]
-    for i, batch_elem in enumerate(x):
-        offset = np.random.randint(.24 * n, max(1, .74 * n))  # high must stay >= low
-        x[i, :] = torch.roll(batch_elem, offset, dims=0)  # batch_elem = [400000, ]
-    return x
+    # circular-shift the flattened 1D sound
+    n = x.shape[0]
+    offset = np.random.randint(.24 * n, max(1, .74 * n))  # high must stay >= low
+    return torch.roll(x, offset, dims=0)
 
 def _delete_param(cfg, full_name):
     parts = full_name.split('.')
@@ -70,18 +68,14 @@ class AudioGen(nn.Module):
 
         # AudioGen 16 kHz / StyleTTS2 24 kHz / MMSTTS 24 kHz
 
-        x = self.resample_fn(x)
-
-        # batch size = different sounds for same txt
-
-        x = x.repeat(1, N_REPEAT)
-
-        # less periodic - shift every batch elem
+        x = self.resample_fn(x)  # [N_REPEAT, duration]
+
+        x = x.repeat(1, N_REPEAT).reshape(-1)
 
         for _ in range(7):
             x = _shift(x)
 
-        x = x.reshape(-1)
         print(x.abs().max(), 'MAX')
         return x / (x.abs().max() + 1e-7)

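The repeat-and-roll trick above can be exercised on its own: the sound is now tiled and flattened to a single 1D waveform, then circularly shifted several times so the tiling period is no longer audible. A minimal standalone sketch, assuming a mono tensor as input (the 16 kHz one-second length here is illustrative, not from the repo):

```python
import numpy as np
import torch

N_REPEAT = 3  # clones of the generated sound, as in builders.py

def _shift(x):
    # circular-shift a flattened 1D waveform by a random offset in [0.24*n, 0.74*n)
    n = x.shape[0]
    offset = np.random.randint(int(.24 * n), max(1, int(.74 * n)))
    return torch.roll(x, offset, dims=0)

x = torch.randn(1, 16000)              # stand-in for 1 s of 16 kHz AudioGen output
x = x.repeat(1, N_REPEAT).reshape(-1)  # tile, then flatten to [N_REPEAT * 16000]
for _ in range(7):
    x = _shift(x)                      # repeated rolls blur the exact periodicity
x = x / (x.abs().max() + 1e-7)         # peak-normalise, as in AudioGen.forward
```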
audiocraft/transformer.py CHANGED
@@ -3,8 +3,8 @@ import torch.nn as nn
 from torch.nn import functional as F
 from einops import rearrange
 
-def create_sin_embedding(positions,
-                         dim,
+def create_sin_embedding(positions,
+                         dim,
                          max_period = 10000,
                          dtype = torch.float32):
     """Create sinusoidal positional embedding, with shape `[B, T, C]`.
@@ -78,28 +78,20 @@ class StreamingMultiheadAttention(nn.Module):
 
 
         if self.k_history is not None:
-            # k_history.shape = torch.Size([2*N_REPEAT, 24, 3, 64]) for cfg > k.shape = torch.Size([2, 24, 1, 64])
-            # 24 heads, 64 dim
+            # flush: preserve only the first 4 and the most recent kv entries
+            if self.k_history.shape[2] > 71:
+                self.k_history = torch.cat([self.k_history[:, :, :4, :], self.k_history[:, :, -1:, :]], 2)
+                self.v_history = torch.cat([self.v_history[:, :, :4, :], self.v_history[:, :, -1:, :]], 2)
+            # fill new k/v
             self.k_history = torch.cat([self.k_history, k], 2)  # IF ctrl^c here during live demo it is non-atomic k != v
             self.v_history = torch.cat([self.v_history, v], 2)  # thus it will try to continue with incompatible k/v dims!
-            # Preserve first 4-10 tokens & flush kv
-            if self.k_history.shape[2] > 24:
-                # find LOWEST l2 norm of keys > https://arxiv.org/pdf/2406.11430v4
-                low_norm = (self.k_history * self.k_history).mean(3, keepdims=True).sum(1, keepdims=True)  # [bs, 24, T, 64] -> [bs, 1, T, 1]
-                _, _ix = torch.topk(low_norm, k=10, dim=2, largest=False)  # shows background music due to cfg - loses the txt conditioning if flushed!
-                _ix = _ix.repeat(1, 24, 1, 64)
-                self.k_history = torch.gather(self.k_history, 2, _ix)
-                self.v_history = torch.gather(self.v_history, 2, _ix)
         else:
-            # init on 1st token (for all 47 transf layers)
-            print(f'AudioGen kv cache Flush')
+            # init
             self.k_history = k
             self.v_history = v
+        # For self attn prepare
         k = self.k_history
         v = self.v_history

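In isolation, the new policy replaces the low-L2-norm key eviction (the removed lines citing https://arxiv.org/pdf/2406.11430v4) with an attention-sink style cache: once the cache exceeds 71 steps, everything between the first 4 entries and the latest entry is dropped. A minimal sketch of this policy, with tensor shapes taken from the removed comments; `prune_kv` and the demo tensors are hypothetical, not part of the repo:

```python
import torch

MAX_T = 71   # flush threshold used in StreamingMultiheadAttention
N_SINK = 4   # leading kv entries that are always preserved

def prune_kv(k_history, v_history):
    # hypothetical helper: k/v_history are [bs, heads, T, head_dim]
    if k_history.shape[2] > MAX_T:
        # keep the first N_SINK cached steps plus the most recent one
        k_history = torch.cat([k_history[:, :, :N_SINK, :], k_history[:, :, -1:, :]], 2)
        v_history = torch.cat([v_history[:, :, :N_SINK, :], v_history[:, :, -1:, :]], 2)
    return k_history, v_history

k = torch.randn(2, 24, 80, 64)  # e.g. 24 heads, 64-dim, 80 cached steps
v = torch.randn(2, 24, 80, 64)
k, v = prune_kv(k, v)
print(k.shape)  # torch.Size([2, 24, 5, 64]) -> 4 sink entries + the last one
```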
msinference.py CHANGED
@@ -390,7 +390,7 @@ def foreign(text=None,  # split sentences here so we can prepend a txt for germ
 
     x = net_g(input_ids=inputs.input_ids.to(device),
               attention_mask=inputs.attention_mask.to(device),
-              speed=.94 + .4 * np.random.rand()  # variable speed / sentence
+              speed=.94 + .84 * np.random.rand()  # variable speed / sentence
              )[0, :]
 
     # crop the 1st audio - it is the PREFIX text (156000 samples) to choose the deu voice / VitsAttention()
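
The only change here widens the per-sentence speed jitter: since `np.random.rand()` is uniform in [0, 1), the sampled speed moves from the range [0.94, 1.34) to [0.94, 1.78). A quick illustrative check of the new range:

```python
import numpy as np

speeds = .94 + .84 * np.random.rand(10000)  # per-sentence speed, new range
print(speeds.min(), speeds.max())           # ~0.94 ... ~1.78
```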