Dionyssos committed
Commit bc7f42e · 1 Parent(s): 53c0776

determinis

Files changed (3)
  1. audiocraft/builders.py +30 -45
  2. audiocraft/lm.py +49 -59
  3. audiocraft/transformer.py +69 -77
audiocraft/builders.py CHANGED
@@ -1,26 +1,25 @@
-import omegaconf
-import torchaudio
 import torch
 from torch import nn
+from omegaconf import OmegaConf
 import numpy as np
 from huggingface_hub import hf_hub_download
 import os
-from omegaconf import OmegaConf
-from .encodec import EncodecModel
-from .lm import LMModel
-from .seanet import SEANetDecoder
-from .vq import ResidualVectorQuantizer
+from audiocraft.encodec import EncodecModel
+from audiocraft.lm import LMModel
+from audiocraft.seanet import SEANetDecoder
+from audiocraft.vq import ResidualVectorQuantizer
+
 
-# torch.backends.cudnn.deterministic = True
 N_REPEAT = 2  # num (virtual batch_size) clones of audio sounds
 
 def _shift(x):
-    n = len(x)
-    offset = np.random.randint(.24 * n, max(1, .74 * n))  # high has to stay >= 1 (TBD)
-    if isinstance(x, torch.Tensor):
-        return torch.roll(x, offset, dims=0)
-    elif isinstance(x, str):
-        return x[offset:] + x[:offset]  # np.roll(x, offset)
+    # print(x.shape, 'batch-independent shift / AudioGen')
+    for i, _slice in enumerate(x):
+        n = x.shape[2]
+        offset = np.random.randint(.24 * n, max(1, .74 * n))  # high has to stay >= 1 (TBD)
+        print(offset)
+        x[i, :, :] = torch.roll(_slice, offset, dims=1)  # _slice is 2D
+    return x
 
 class AudioGen(torch.nn.Module):
 
@@ -29,18 +28,13 @@ class AudioGen(torch.nn.Module):
     def __init__(self):
 
         super().__init__()
-        # self.autocast = TorchAutocast(
-        #     enabled=True, device_type='cuda', dtype=torch.float16)
-        # Vocoder
         _file_1 = hf_hub_download(
             repo_id='facebook/audiogen-medium',
             filename="compression_state_dict.bin",
             cache_dir=os.environ.get('AUDIOCRAFT_CACHE_DIR', None),
             library_name="audiocraft",
             library_version='1.3.0a1')  # found in __init__.py  # audiocraft.__version__
-        pkg = torch.load(_file_1, map_location='cpu')
-        # kwargs = OmegaConf.create(pkg['xp.cfg'])
-        # kwargs.device = 'cpu'
+        pkg = torch.load(_file_1, map_location='cpu')  # kwargs = OmegaConf.create(pkg['xp.cfg'])
         decoder = SEANetDecoder()
         quantizer = ResidualVectorQuantizer()
         self.compression_model = EncodecModel(decoder=decoder,
@@ -50,12 +44,10 @@ class AudioGen(torch.nn.Module):
                                               sample_rate=16000,
                                               channels=1,
                                               causal=False)  # .to(cfg.device)
-        # self.compression_model = self.get_compression_model(cfg)
-        self.compression_model.load_state_dict(pkg['best_state'], strict=False)  # ckpt also has unused encoder weights
-        self.resample_fn = torchaudio.transforms.Resample(16000, 24000)  # AudioGen = 16 kHz, StyleTTS2 = 24 kHz, MMS-TTS = 24 kHz
-        # # T5 &
+        self.compression_model.load_state_dict(pkg['best_state'], strict=False)
+        self.compression_model.eval()  # ckpt also has unused encoder weights
+        # T5 &
         # LM
-
         _file_2 = hf_hub_download(
             repo_id='facebook/audiogen-medium',
             filename="state_dict.bin",
@@ -65,37 +57,30 @@ class AudioGen(torch.nn.Module):
         pkg = torch.load(_file_2, map_location='cpu')
         cfg = OmegaConf.create(pkg['xp.cfg'])  # cfg lives inside the torch bin
         _best = pkg['best_state']
-        # _best['condition_provider.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')
-        # _best['condition_provider.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')
         _best['t5.output_proj.weight'] = _best.pop('condition_provider.conditioners.description.output_proj.weight')  # .to(torch.float)
         _best['t5.output_proj.bias'] = _best.pop('condition_provider.conditioners.description.output_proj.bias')  # .to(torch.float)
-        self.lm = LMModel()  # to(torch.float16)
-        self.lm.load_state_dict(pkg['best_state'],
-                                strict=True)
-        #
+        self.lm = LMModel()
+        self.lm.load_state_dict(pkg['best_state'], strict=True)
         self.lm.eval()
-        self.compression_model.eval()
+
 
     @torch.no_grad()
     def generate(self,
                  prompt='dogs mewo',
-                 duration=2.24,  ## seconds of audio
+                 duration=2.24,  # seconds of audio
                  ):
+        torch.manual_seed(42)  # https://github.com/facebookresearch/audiocraft/issues/111#issuecomment-1614732858
+        self.lm.n_draw = int(duration / 12) + 1  # one extra draw per 12 seconds of requested audio
+
         with torch.autocast(device_type='cuda', dtype=torch.float16):
             gen_tokens = self.lm.generate(
-                text_condition=[prompt] + [prompt[:10] + _shift(prompt) for _ in range(N_REPEAT-1)] + [''] * N_REPEAT,  # '' is the null condition, e.g. ['trance', 'dogs meow', '', '']
-                max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate))  # [bs, 4, 37 * self.lm.n_draw]
-
+                text_condition=[prompt] * N_REPEAT + [''] * N_REPEAT,  # ['dogs', 'dogs...!', '', '']
+                max_tokens=int(duration / (N_REPEAT * self.lm.n_draw) * self.compression_model.frame_rate)
+                )  # [bs, 4, 74 * self.lm.n_draw]
         x = self.compression_model.decode(gen_tokens, None)  # [bs, 1, 11840]
 
-        x = x[:, 0, :]  # the last samples have splash sounds; discard the last 25000 samples
-
-        # AudioGen 16 kHz / StyleTTS2 24 kHz / MMS-TTS 24 kHz
-
-        x = self.resample_fn(x)  # [N_REPEAT, duration]
 
-        x = x.reshape(-1)
+        for _ in range(7):  # perhaps the shift is too random, as lm.n_draw already adds randomness
+            x = _shift(x)
 
-        # for _ in range(7):
-        #     x = _shift(x)
-        return x  # x / (x.abs().max() + 1e-7)
+        return x.reshape(-1)  # x / (x.abs().max() + 1e-7)
 
 
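The new _shift rolls each batch item along the time axis by its own random offset, so the N_REPEAT clones (and the 7 repeated shifts at the end of generate()) decorrelate without altering the audio content. A minimal, self-contained illustration with hypothetical tensor sizes:

import numpy as np
import torch

x = torch.arange(12.).reshape(2, 1, 6)                # [batch, channels, time], toy values
for i, _slice in enumerate(x):
    n = x.shape[2]
    offset = np.random.randint(int(.24 * n), max(1, int(.74 * n)))
    x[i, :, :] = torch.roll(_slice, offset, dims=1)   # wrap the tail of row i to its front
print(x)                                              # each row is rotated by a different offset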
 
 
audiocraft/lm.py CHANGED
@@ -6,14 +6,14 @@ from transformers import T5EncoderModel, T5Tokenizer  # type: ignore
 class T5(nn.Module):
 
     def __init__(self):
-        # run this from within lm so it autocasts, thus matching the exact values of t5 in official audiogen
+
         super().__init__()
         self.output_proj = nn.Linear(1024,  # t5-large
                                      1536)  # lm hidden
         self.t5_tokenizer = T5Tokenizer.from_pretrained('t5-large', legacy=True)
         t5 = T5EncoderModel.from_pretrained('t5-large').train(mode=False)
 
-        # this makes sure that the t5 models is not part
+        # this makes sure that the t5 is not part
         # of the saved checkpoint
         self.__dict__['t5'] = t5.to('cuda:0')
 
@@ -28,9 +28,8 @@ class T5(nn.Module):
 
         x = self.t5(input_ids=d['input_ids'],
                     attention_mask=d['attention_mask']).last_hidden_state  # no kv
-
-        # output_proj as float32
-        print('BEF PROJ', x[0, :, :].sum(), x[1, :, :].sum(), self.output_proj.weight.sum(), self.output_proj.weight.dtype, self.output_proj.bias.sum(), 'GEN\n\n143')
+        # Float 16
+        # > self.output_proj() is outside the autocast of t5, but inside the autocast of lm, thus computed in torch.float16
         x = self.output_proj(x)  # nn.Linear() - produces a different result if there is no duplicate txt condition here
         x[bs:, :, :] = 0  # venv/../site-packages/audiocraft/modules/conditioners.py -> tokenize()
         return x
@@ -41,71 +40,55 @@ class LMModel(nn.Module):
     def __init__(self,
                  n_q = 4,
                  card = 2048,
-                 dim = 1536,
-                 num_heads = 24,
-                 hidden_scale = 4,  # FFN of Transformer
+                 dim = 1536
                  ):
         super().__init__()
         self.t5 = T5()
-        self.card = card  # 2048 ?
-        self.n_draw = 1  # draw additional tokens at each call:
-        # Batch size is slower than n_draw as it calls the transformer on a larger batch
-        # n_draw instead draws more tokens/phonemes from torch.multinomial - after execution of lm
-        embed_dim = self.card + 1
-        self.n_q = n_q
-        self.dim = dim
-        self.emb = nn.ModuleList([nn.Embedding(embed_dim, dim) for _ in range(n_q)])  # EMBEDDING HAS 2049
-        self.transformer = StreamingTransformer(
-            d_model=dim,
-            num_heads=num_heads,
-            dim_feedforward=int(hidden_scale * dim),
-            num_layers=48,
-            positional_embedding='sin',
-        )
+        self.card = card  # 2048
+        self.n_draw = 1  # draw > 1 tokens, each with a different CFG scale
+        # batch size > 1 is slower than n_draw, as it calls the transformer on a larger batch
+        self.emb = nn.ModuleList([nn.Embedding(self.card + 1, dim) for _ in range(n_q)])  # EMBEDDING HAS 2049
+        self.transformer = StreamingTransformer()
         self.out_norm = nn.LayerNorm(dim, eps=1e-5)
         self.linears = nn.ModuleList([nn.Linear(dim, self.card, bias=False) for _ in range(n_q)])  # LINEAR DOESNT HAVE 2049
 
     def forward(self,
                 sequence,
                 condition_tensors=None,
-                token_count=None):
+                cache_position=None):
 
         bs, n_q, time_frames = sequence.shape  # [bs, 4, time]
 
-        input_ = sum([self.emb[k](sequence[:, k]) for k in range(self.n_q)])
+        input_ = sum([self.emb[k](sequence[:, k]) for k in range(n_q)])
 
         out = self.transformer(torch.cat([input_, input_], 0),  # duplicate null condition (bs x 2) for ClassifierFreeGuidance
                                cross_attention_src=condition_tensors,
-                               token_count=token_count
+                               cache_position=cache_position
                                )
 
-        logits = torch.stack([self.linears[k](self.out_norm(out)) for k in range(self.n_q)], dim=1)  # [2*bs, 4, 1, 2048]
-
-        logits = 3 * logits[:bs, :, :, :] - 2 * logits[bs:, :, :, :]  # [3, 4, 1, 2048]
-
-        # SAMPLE TOP K
-        k = 400  # 450 is a nice sound, still the train honk is clear!
-        p = torch.softmax(logits, dim=3)
-        top_k_value, _ = torch.topk(p, k, dim=3)  # [3, 4, 1, k]
-        min_value_top_k = top_k_value[:, :, :, -1:]
-        p *= (p >= min_value_top_k).float()  # zero low probs
-        p.div_(p.sum(dim=-1, keepdim=True))  # renormalise on non-zero probs
-
-        # BRING THE n_q = 4 INTO THE BATCH
-        p = p.reshape(bs * self.n_q, 2048)
-        out = torch.multinomial(p,  # p=[bs,2048], out=[bs, num_samples]
-                                num_samples=self.n_draw,
-                                replacement=False)  # [bs*4, self.n_draw]
-        # print('DRAW', 'c', out)
-        return out.reshape(bs, self.n_q, self.n_draw).transpose(1, 2)  # [bs=3 not 6, self.n_draw, 4]
+        logits = torch.stack([self.linears[k](self.out_norm(out)) for k in range(n_q)], dim=1)  # [2*bs, 4, 1, 2048]
+        logits = 3 * logits[:bs, :, :, :] - self._scale * logits[bs:, :, :, :]  # broadcasts over _scale -> [bs, 4, n_draw, 2048]
+
+        k = 24
+        logits = torch.softmax(logits / 1.0, dim=3)  # [bs, 4, n_draw, 2048]
+        p, ix = torch.topk(logits, k, dim=3)  # p = [bs, 4, n_draw, 24], ix = [bs, 4, n_draw, 24]
+        # Exponential Distribution
+        deflation = torch.empty_like(p).exponential_(lambd=1)
+        p = p / deflation
+        # dividing by Exponential(1) noise and taking the argmax samples from the top-k probs (Gumbel-max style)
+        p = p.argmax(dim=3, keepdim=True)  # [bs, 4, n_draw, 1]
+        tok = ix.gather(dim=3, index=p).to(torch.int64)  # [bs, 4, n_draw, 1]
+        return tok[:, :, :, 0].transpose(1, 2)  # [bs, n_draw, 4]
 
     @torch.no_grad()
     def generate(self,
                  max_tokens=None,
-                 text_condition=None
-                 ):
+                 text_condition=None):
         x = self.t5(text_condition)
         bs = x.shape[0] // 2  # has null conditions - bs*2*N_REPEAT applies in builders.py
+        self._scale = .3 * torch.rand(1, 1, self.n_draw, 1, device=x.device) + 1.94
+        cache_position = 0
+
         out_codes = torch.full((bs,
                                 self.n_draw,
                                 4,
@@ -113,14 +96,15 @@ class LMModel(nn.Module):
                                self.card,
                                dtype=torch.long,
                                device=x.device)  # [bs, n_draw, 4, dur]
-        # =========================================
+
+        # A/R
         for offset in range(0, max_tokens + 4 - 1):  # max_tokens + n_q - 1
 
             # extract the antidiagonal via indexing out_codes[[0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset]
             next_token = self.forward(out_codes[:, 0, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset][:, :, None],  # index the antidiagonal & expand to [bs, n_q, dur=1]
                                       # DIAG-INDEXING: the lm prediction is placed diagonally (with delay) into gen_sequence; gen_sequence is un-delayed at the end for the vocoder, so only the lm input needs to see the delay, hence feeding by diag-gather, e.g. a[[0, 1, 2, 3], torch.tensor([0, 1, 2, 3]) + 5]
                                       condition_tensors=x,  # utilisation of the attention mask of the txt condition ?
-                                      token_count=offset)  # [bs, n_draw, 4]
+                                      cache_position=cache_position)  # [bs, n_draw, 4]
 
             # The fill of next_token should also be placed on the antidiagonal [not the column]
 
@@ -133,11 +117,11 @@ class LMModel(nn.Module):
             #  [2048, 2048, 2048, 2048, 2048, 2048, 2048, 0, 1, 2, 3, 4, 5, 6]]
             # NO overwriting
             if offset == 0:
-
+
                 next_token[:, :, 1:4] = 2048  # self.card - the bottom 3 entries of the antidiagonal should remain 2048
 
             elif offset == 1:
-
+
                 next_token[:, :, 2:4] = 2048  # the bottom 2 entries of the antidiagonal should remain 2048
 
             elif offset == 2:
@@ -157,16 +141,22 @@ class LMModel(nn.Module):
                 next_token[:, :, 0:3] = 2048
 
             else:  # offset 3, 4, 5, 6, 7, ..., max_tokens-1  # FILL the complete n_q = 4 ANTIDIAGONAL ENTRIES
-
+
                 pass  # print('No delete anti-diag')
 
             out_codes[:, :, [0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = next_token
-        print('\nFULL FINAL TOKENS UNFILT\n', out_codes[:, 0, :, 4:max_tokens+4], out_codes[0, 0, :, 4:max_tokens+4].shape)
-        # EXTRACT COLUMNS, AS THE ALIGNMENT IS ALREADY DONE BY FILLING DIAGONALLY
-        out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)  # [bs, 4, duration*n_draw] DISCARD FILL 2048
+            # Sink Attn
+            if (offset > 0) and (offset % 71) == 0:
+                n_preserve = 4
+                self.transformer._flush(n_preserve=n_preserve)
+                cache_position = n_preserve
+            else:
+                cache_position += 1
+
+        # [bs, n_draw, 4, time+xtra] -> [bs, 4, n_draw, time] -> [bs, 4, time * n_draw]
+        out_codes = out_codes[:, :, :, 4:max_tokens+4].transpose(1, 2).reshape(bs, 4, self.n_draw * max_tokens)
 
-        for lay in self.transformer.layers:
-            lay.self_attn.k_history = None
-            lay.self_attn.v_history = None
+        # flush for the next API call
+        self.transformer._flush()
 
-        return out_codes  # SKIP THE 4 fill 2048 bs*n_draw, duration -> repeat/shift in api.py
+        return out_codes  # SKIP THE 4 fill-2048 frames
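A note on the new sampler in LMModel.forward(): dividing the top-k probabilities by i.i.d. Exponential(1) noise and taking the argmax is the Gumbel-max trick in disguise, so it draws an index with probability proportional to its top-k probability — the same distribution torch.multinomial sampled before, without the renormalisation and reshape. The per-draw guidance scale self._scale (uniform in roughly [1.94, 2.24]) is what makes the n_draw variants differ. A small sanity-check sketch, not part of the commit:

import torch

torch.manual_seed(0)
p = torch.tensor([0.7, 0.2, 0.1])                 # stand-in for the k=24 top-k probabilities
counts = torch.zeros(3)
for _ in range(20_000):
    noise = torch.empty_like(p).exponential_(lambd=1)
    counts[(p / noise).argmax()] += 1
print(counts / counts.sum())                      # ~= tensor([0.70, 0.20, 0.10])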
 
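For readers new to MusicGen/AudioGen-style interleaving: the loop above maintains a "delay pattern" — at step offset the model emits one token per codebook, and codebook k's token belongs to time frame offset - k, which is why writes land on an antidiagonal and why the first three steps force the lower rows to 2048 (the "empty" id). The toy re-enactment below (hypothetical max_tokens, with the sampled token replaced by the step index, batch and n_draw dims dropped) shows that slicing columns 4:max_tokens+4 afterwards yields time-aligned codebooks; the masking branches elided from the hunk are assumed to mirror the first three symmetrically at the end.

import torch

card, n_q, max_tokens = 2048, 4, 5
buf = torch.full((n_q, max_tokens + 8), card, dtype=torch.long)   # extra room, as in the commit
for offset in range(max_tokens + n_q - 1):
    nxt = torch.full((n_q,), offset)              # stand-in for the 4 sampled codes
    if offset == 0:
        nxt[1:4] = card                           # lower codebooks not valid yet
    elif offset == 1:
        nxt[2:4] = card
    elif offset == 2:
        nxt[3:4] = card
    elif offset >= max_tokens:
        nxt[0:offset - max_tokens + 1] = card     # upper codebooks already finished
    buf[[0, 1, 2, 3], torch.tensor([3, 2, 1, 0]) + offset + 1] = nxt
print(buf[:, 4:max_tokens + 4])
# tensor([[0, 1, 2, 3, 4],
#         [1, 2, 3, 4, 5],
#         [2, 3, 4, 5, 6],
#         [3, 4, 5, 6, 7]])  -> frame t of codebook k was produced at step t + k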
audiocraft/transformer.py CHANGED
@@ -5,17 +5,23 @@ from einops import rearrange
 
 torch.backends.cuda.enable_mem_efficient_sdp(True)
 
-def create_sin_embedding(positions,
+
+def create_sin_embedding(positions,
                          dim,
                          max_period=10000
                          ):
-    assert dim % 2 == 0
+    # assert dim % 2 == 0
     half_dim = dim // 2
     positions = positions.to(torch.float)
-    adim = torch.arange(half_dim, device=positions.device, dtype=torch.float).view(1, 1, -1)
-    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=torch.float)  # avoid sync point
+    adim = torch.arange(half_dim, device=positions.device,
+                        dtype=torch.float).view(1, 1, -1)
+    max_period_tensor = torch.full([],
+                                   max_period,
+                                   device=positions.device,
+                                   dtype=torch.float)  # avoid sync point
     phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
-    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)  # OFFICIAL is torch.float32 HOWEVER self_attn.in_prod_weight = torch.float16
+    # OFFICIAL is torch.float32 HOWEVER self_attn.in_proj_weight = torch.float16
+    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)
 
 
 class StreamingMultiheadAttention(nn.Module):
@@ -23,19 +29,20 @@ class StreamingMultiheadAttention(nn.Module):
     def __init__(self,
                  embed_dim,
                  num_heads,
-                 cross_attention = False,
+                 cross_attention=False,
                  ):
 
         super().__init__()
 
         self.cross_attention = cross_attention
-        self.embed_dim = embed_dim
-        self.k_history = None  # previous k from the tokens already seen in the current generation - only for self.attn
-        self.v_history = None  # cleaned up IN LM after finishing GENERATION - each of the 1..47 mha has a different kv history
+        # if not self.cross_attention then it has kv caching
+        self.k_history = None
+        # cleanup of the history happens in the LM during GENERATION - each of the 0..47 mha has a different kv history
+        self.v_history = None
         self.num_heads = num_heads
         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=False)
         self.register_buffer('in_proj_weight', torch.ones((3 * embed_dim, embed_dim),
-                                                          dtype=torch.float))
+                                                          dtype=torch.float))
 
     def forward(self,
                 query,
@@ -44,15 +51,16 @@ class StreamingMultiheadAttention(nn.Module):
         layout = "b h t d"
         if self.cross_attention:
 
-            # Different queries, keys, values; we have to split the in_proj_weight manually
-
+            # Different queries, keys, values > split in_proj_weight
+
             dim = self.in_proj_weight.shape[0] // 3
 
             q = nn.functional.linear(query, self.in_proj_weight[:dim])
             k = nn.functional.linear(key, self.in_proj_weight[dim: 2 * dim])
             v = nn.functional.linear(value, self.in_proj_weight[2 * dim:])
 
-            q, k, v = [rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
+            q, k, v = [
+                rearrange(x, f"b t (h d) -> {layout}", h=self.num_heads) for x in [q, k, v]]
 
         else:
             # 1st projection makes k, v (instantaneous)
@@ -60,59 +68,45 @@ class StreamingMultiheadAttention(nn.Module):
 
             # HISTORY - DIFFERENT FOR EACH TRANSFORMER LAYER
 
-            projected = nn.functional.linear(query, self.in_proj_weight, None)  # here we have different floating values from official
+            # here we have different floating values from official
+            projected = nn.functional.linear(query, self.in_proj_weight, None)
             # print(query.sum(), projected.sum(), self.in_proj_weight.sum(), 'Lc')  # verified official AudioGen values
             bound_layout = "b h p t d"
-            packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
+            packed = rearrange(
+                projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
             q, k, v = packed.unbind(dim=2)
             if self.k_history is not None:
-                # flush
-                if self.k_history.shape[2] > 71:
-
-                    self.k_history = torch.cat([self.k_history[:, :, :4, :], self.k_history[:, :, -1:, :]], 2)
-                    self.v_history = torch.cat([self.v_history[:, :, :4, :], self.v_history[:, :, -1:, :]], 2)
-                # fill new k/v
-                self.k_history = torch.cat([self.k_history, k], 2)  # IF ctrl^c here during the live demo it is non-atomic: k != v
-                self.v_history = torch.cat([self.v_history, v], 2)  # thus it will try to continue with incompatible k/v dims!
-
+                # IF ctrl^c during the live demo, the assignment of k/v is non-atomic (k != v),
+                # thus it would try to continue with incompatible k/v dims!
+                self.k_history = torch.cat([self.k_history, k], 2)
+                self.v_history = torch.cat([self.v_history, v], 2)
             else:
-                # init
                 self.k_history = k
                 self.v_history = v
-            # For self attn prepare
+
+            # Assign the completed k / v
+
             k = self.k_history
             v = self.v_history
 
+            # -> the kv cache only applies if not self.cross_attention
 
-
-        # KV COMPLETION ONLY ON SELF ATTENTION
-
         x = torch.nn.functional.scaled_dot_product_attention(
-            q, k, v, is_causal=False, dropout_p=0)
+            q, k, v, attn_mask=None, is_causal=False, dropout_p=0.0)
 
         x = rearrange(x, f"{layout} -> b t (h d)", h=self.num_heads)
         x = self.out_proj(x)
         return x
 
 
-class StreamingTransformerLayer(nn.TransformerEncoderLayer):
-
+class StreamingTransformerLayer(nn.Module):
 
     def __init__(self,
                  d_model,
                  num_heads,
                  dim_feedforward):
-
-        super().__init__(d_model,
-                         num_heads,
-                         dim_feedforward=dim_feedforward,
-                         dropout=0.0,
-                         device='cuda',
-                         dtype=torch.float32,
-                         batch_first=True,
-                         norm_first=True,
-                         activation='gelu')
-        # super().__init__()
+
+        super().__init__()
 
         self.self_attn = StreamingMultiheadAttention(embed_dim=d_model,
                                                      num_heads=num_heads)
@@ -125,15 +119,14 @@ class StreamingTransformerLayer(nn.TransformerEncoderLayer):
         self.norm1 = nn.LayerNorm(d_model, eps=1e-5)
         self.norm2 = nn.LayerNorm(d_model, eps=1e-5)
 
-
     def forward(self,
                 x,
-                cross_attention_src=None):  # txt cond
+                cross_attention_src=None):
         x = x + self.self_attn(self.norm1(x))
-        x = x + self.cross_attention(query = self.norm_cross(x),
-                                     key = cross_attention_src,
-                                     value = cross_attention_src)  # txt condition
-        x = x + self.linear2(F.gelu(self.linear1( self.norm2(x) )))
+        x = x + self.cross_attention(query=self.norm_cross(x),
+                                     key=cross_attention_src,
+                                     value=cross_attention_src)  # txt condition
+        x = x + self.linear2(F.gelu(self.linear1(self.norm2(x))))
         return x
 
 
@@ -143,39 +136,38 @@ class StreamingTransformer(nn.Module):
                  d_model=1536,
                  num_heads=24,
                  num_layers=48,
-                 dim_feedforward=6144,
-                 cross_attention = True,
-                 positional_embedding: str = 'sin',
-                 max_period: float = 10_000
-                 ):
+                 dim_feedforward=6144):
         super().__init__()
-        assert d_model % num_heads == 0
-
-        self.positional_embedding = positional_embedding
-        self.max_period = max_period
-        self.layers = nn.ModuleList()
-        for idx in range(num_layers):
-            self.layers.append(
-                StreamingTransformerLayer(
-                    d_model=d_model,
-                    num_heads=num_heads,
-                    dim_feedforward=dim_feedforward
-                )
-            )
+
+        self.layers = nn.ModuleList(
+            [
+                StreamingTransformerLayer(d_model=d_model,
+                                          num_heads=num_heads,
+                                          dim_feedforward=dim_feedforward) for _ in range(num_layers)
+            ]
+        )
 
     def forward(self,
                 x,
-                token_count=None,
+                cache_position=None,
                 cross_attention_src=None):
 
-        if self.positional_embedding in ['sin', 'sin_rope']:
-            pos_emb = create_sin_embedding(torch.zeros(x.shape[0], 1, 1, device=x.device) + token_count,
-                                           1536,
-                                           max_period=self.max_period)
+        x = x + create_sin_embedding(
+            torch.zeros(x.shape[0], 1, 1, device=x.device) + cache_position, 1536)
 
-        x = x + pos_emb
-        for j, lay in enumerate(self.layers):
-            x = lay(x, cross_attention_src=cross_attention_src)  # cross_attention_src = txt-cond x audio
-            # self attn = audio x audio
-            # every layer (mha) keeps its own kv cache
+        for lay in self.layers:
+            x = lay(x,
+                    cross_attention_src=cross_attention_src)
         return x
+
+    def _flush(self,
+               n_preserve=None):
+
+        for lay in self.layers:
+            if n_preserve is not None:
+                # keep only the first n_preserve positions; a cache position that also preserves kv from the end is difficult to choose
+                lay.self_attn.k_history = lay.self_attn.k_history[:, :, :n_preserve, :]
+                lay.self_attn.v_history = lay.self_attn.v_history[:, :, :n_preserve, :]
+            else:
+                lay.self_attn.k_history = None
+                lay.self_attn.v_history = None
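The kv-cache trimming that used to live inside the attention forward (if self.k_history.shape[2] > 71) now sits in StreamingTransformer._flush(), driven from lm.py: every 71 offsets the LM keeps only the first 4 cached positions (the "Sink Attn" reset) and restarts cache_position at 4, and a final _flush() clears the cache between API calls. A minimal sketch of the pattern for a single layer, with simplified shapes ([batch, heads, time, head_dim]):

import torch

k_history = None

def step(k_new):
    global k_history
    k_history = k_new if k_history is None else torch.cat([k_history, k_new], dim=2)
    return k_history                              # attention then runs against the full history

def flush(n_preserve=None):
    global k_history
    # keep only the first n_preserve positions, or drop the cache entirely
    k_history = None if n_preserve is None else k_history[:, :, :n_preserve, :]

for _ in range(72):
    step(torch.randn(1, 24, 1, 64))
print(k_history.shape[2])                         # 72
flush(n_preserve=4)
print(k_history.shape[2])                         # 4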
 
 
 
 
 
 
 
 
 
 
 
 
 
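Since the transformer is fed one frame per step, create_sin_embedding is evaluated for a single position (cache_position) and broadcast over the batch; after a sink flush, lm.py restarts the position at 4 rather than the true offset. A quick shape check using the function exactly as added in this commit (batch size and position below are illustrative):

import torch

def create_sin_embedding(positions, dim, max_period=10000):
    half_dim = dim // 2
    positions = positions.to(torch.float)
    adim = torch.arange(half_dim, device=positions.device, dtype=torch.float).view(1, 1, -1)
    max_period_tensor = torch.full([], max_period, device=positions.device, dtype=torch.float)
    phase = positions / (max_period_tensor ** (adim / (half_dim - 1)))
    return torch.cat([torch.cos(phase), torch.sin(phase)], dim=-1)

x = torch.zeros(4, 1, 1536)                       # one new frame for a batch of 4
cache_position = 7                                # frames already held in the kv cache
pos_emb = create_sin_embedding(torch.zeros(x.shape[0], 1, 1) + cache_position, 1536)
print(pos_emb.shape)                              # torch.Size([4, 1, 1536])
x = x + pos_emb                                   # what StreamingTransformer.forward() does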