Harry Coultas Blum committed
Commit 1a21598 · 1 Parent(s): c64babc

trying to cast

Files changed (2):
  1. requirements.txt +0 -1
  2. vui/inference.py +6 -29
requirements.txt CHANGED
@@ -7,7 +7,6 @@ numba
 numpy
 feedparser
 pydantic
-pyannote.audio
 soundfile
 tiktoken
 torchaudio
vui/inference.py CHANGED
@@ -4,14 +4,12 @@ import time
 import inflect
 import torch
 import torch.nn.functional as F
-import torchaudio
 from torchaudio.transforms import Resample
 from torch import Tensor
 from torch.nn.attention import SDPBackend, sdpa_kernel

 from vui.model import Vui
 from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
-from vui.vad import detect_voice_activity as vad

 resample = Resample(22050, 16000).cuda()

@@ -154,7 +152,7 @@ def generate(
 ):
     text = simple_clean(text)
     with (
-        torch.amp.autocast("cuda", torch.bfloat16, True),
+        torch.autocast("cuda", torch.bfloat16, True),
         sdpa_kernel([SDPBackend.MATH]),
     ):
         t1 = time.perf_counter()
@@ -330,15 +328,8 @@ def render(
         )
         codes = codes[..., :-10]
         audio = self.codec.from_indices(codes)
+        return audio

-        paudio = resample(audio[0])
-        results = vad(paudio)
-
-        if len(results):
-            # Cut the audio based on VAD results, add 200ms silence at end
-            s, e = results[0][0], results[-1][1]
-            return audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
-
         raise Exception("Failed to render")

     # Otherwise we have to do some clever chaining!
@@ -374,24 +365,10 @@ def render(
                 )

                 codes = codes[..., :-10]
-                audio = self.codec.from_indices(codes)
-                # Resample for VAD
-                paudio = torchaudio.functional.resample(audio[0], 22050, 16000)
-
-                results = vad(paudio)
-                run = len(results) == 0
-
-                if len(results):
-                    prev_text = line
-                    # Cut the audio based on VAD results, add 200ms silence at end
-                    s, e = results[0][0], results[0][1]
-                    codes = codes[..., int(s * HZ) : int(e * HZ)]
-                    prev_codes = codes
-                    audio = audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
-                    audios.append(audio)
-                else:
-                    prev_codes = orig_codes
-                    prev_text = ""
+                paudio = self.codec.from_indices(codes)
+                prev_text = line
+                prev_codes = codes
+                audios.append(paudio)
             except KeyboardInterrupt:
                 break
             except RuntimeError as e:
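
For context on the cast the commit message refers to: in recent PyTorch releases both torch.amp.autocast and torch.autocast accept the same (device_type, dtype, enabled) arguments, so the hunk above only swaps the entry point, not the behaviour. A minimal sketch of what the bfloat16 autocast region does, using a toy layer that is not from this repo and assuming a CUDA device (the module already calls .cuda() unconditionally):

    import torch

    # Toy layer, for illustration only -- not part of vui/inference.py.
    layer = torch.nn.Linear(16, 16).cuda()
    x = torch.randn(1, 16, device="cuda")

    # Same arguments as the diff: device_type, dtype, enabled.
    with torch.autocast("cuda", torch.bfloat16, True):
        y = layer(x)

    # Matmuls inside the block run in bfloat16 while the float32
    # parameters are cast on the fly, so the output is bfloat16.
    print(y.dtype)  # torch.bfloat16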