Spaces:
Running
on
Zero
Running
on
Zero
Harry Coultas Blum
committed on
Commit
·
1a21598
1
Parent(s):
c64babc
trying to cast
Browse files
- requirements.txt +0 -1
- vui/inference.py +6 -29
requirements.txt
CHANGED
@@ -7,7 +7,6 @@ numba
|
|
7 |
numpy
|
8 |
feedparser
|
9 |
pydantic
|
10 |
-
pyannote.audio
|
11 |
soundfile
|
12 |
tiktoken
|
13 |
torchaudio
|
|
|
7 |
numpy
|
8 |
feedparser
|
9 |
pydantic
|
|
|
10 |
soundfile
|
11 |
tiktoken
|
12 |
torchaudio
|
vui/inference.py
CHANGED
@@ -4,14 +4,12 @@ import time
|
|
4 |
import inflect
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
7 |
-
import torchaudio
|
8 |
from torchaudio.transforms import Resample
|
9 |
from torch import Tensor
|
10 |
from torch.nn.attention import SDPBackend, sdpa_kernel
|
11 |
|
12 |
from vui.model import Vui
|
13 |
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
|
14 |
-
from vui.vad import detect_voice_activity as vad
|
15 |
|
16 |
resample = Resample(22050, 16000).cuda()
|
17 |
|
@@ -154,7 +152,7 @@ def generate(
|
|
154 |
):
|
155 |
text = simple_clean(text)
|
156 |
with (
|
157 |
-
torch.
|
158 |
sdpa_kernel([SDPBackend.MATH]),
|
159 |
):
|
160 |
t1 = time.perf_counter()
|
@@ -330,15 +328,8 @@ def render(
|
|
330 |
)
|
331 |
codes = codes[..., :-10]
|
332 |
audio = self.codec.from_indices(codes)
|
|
|
333 |
|
334 |
-
paudio = resample(audio[0])
|
335 |
-
results = vad(paudio)
|
336 |
-
|
337 |
-
if len(results):
|
338 |
-
# Cut the audio based on VAD results, add 200ms silence at end
|
339 |
-
s, e = results[0][0], results[-1][1]
|
340 |
-
return audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
341 |
-
|
342 |
raise Exception("Failed to render")
|
343 |
|
344 |
# Otherwise we have to do some clever chaining!
|
@@ -374,24 +365,10 @@ def render(
|
|
374 |
)
|
375 |
|
376 |
codes = codes[..., :-10]
|
377 |
-
|
378 |
-
|
379 |
-
|
380 |
-
|
381 |
-
results = vad(paudio)
|
382 |
-
run = len(results) == 0
|
383 |
-
|
384 |
-
if len(results):
|
385 |
-
prev_text = line
|
386 |
-
# Cut the audio based on VAD results, add 200ms silence at end
|
387 |
-
s, e = results[0][0], results[0][1]
|
388 |
-
codes = codes[..., int(s * HZ) : int(e * HZ)]
|
389 |
-
prev_codes = codes
|
390 |
-
audio = audio[..., int(s * SR) : int((e + 0.2) * SR)].cpu()
|
391 |
-
audios.append(audio)
|
392 |
-
else:
|
393 |
-
prev_codes = orig_codes
|
394 |
-
prev_text = ""
|
395 |
except KeyboardInterrupt:
|
396 |
break
|
397 |
except RuntimeError as e:
|
|
|
4 |
import inflect
|
5 |
import torch
|
6 |
import torch.nn.functional as F
|
|
|
7 |
from torchaudio.transforms import Resample
|
8 |
from torch import Tensor
|
9 |
from torch.nn.attention import SDPBackend, sdpa_kernel
|
10 |
|
11 |
from vui.model import Vui
|
12 |
from vui.sampling import multinomial, sample_top_k, sample_top_p, sample_top_p_top_k
|
|
|
13 |
|
14 |
resample = Resample(22050, 16000).cuda()
|
15 |
|
|
|
152 |
):
|
153 |
text = simple_clean(text)
|
154 |
with (
|
155 |
+
torch.autocast("cuda", torch.bfloat16, True),
|
156 |
sdpa_kernel([SDPBackend.MATH]),
|
157 |
):
|
158 |
t1 = time.perf_counter()
|
|
|
328 |
)
|
329 |
codes = codes[..., :-10]
|
330 |
audio = self.codec.from_indices(codes)
|
331 |
+
return audio
|
332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
333 |
raise Exception("Failed to render")
|
334 |
|
335 |
# Otherwise we have to do some clever chaining!
|
|
|
365 |
)
|
366 |
|
367 |
codes = codes[..., :-10]
|
368 |
+
paudio = self.codec.from_indices(codes)
|
369 |
+
prev_text = line
|
370 |
+
prev_codes = codes
|
371 |
+
audios.append(paudio)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
372 |
except KeyboardInterrupt:
|
373 |
break
|
374 |
except RuntimeError as e:
|