Update app.py
Browse files
app.py
CHANGED
@@ -14,12 +14,11 @@ from generate_audio import (
|
|
14 |
)
|
15 |
import uuid
|
16 |
|
17 |
-
|
18 |
-
device = "cuda" if torch.cuda.is_available() else "cpu"
|
19 |
vq_model = RQBottleneckTransformer.load_model(
|
20 |
"whisper-vq-stoks-medium-en+pl-fixed.model"
|
21 |
).to(device)
|
22 |
-
|
23 |
use_8bit = False
|
24 |
llm_path = "QuietImpostor/Llama-3.2s-1B-Instruct-v0.1"
|
25 |
tokenizer = AutoTokenizer.from_pretrained(llm_path)
|
@@ -31,12 +30,12 @@ if use_8bit:
|
|
31 |
llm_int8_has_fp16_weight=False,
|
32 |
)
|
33 |
else:
|
34 |
-
model_kwargs["torch_dtype"] = torch.
|
35 |
model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
|
36 |
|
37 |
-
@spaces.
|
38 |
def audio_to_sound_tokens_whisperspeech(audio_path):
|
39 |
-
vq_model.ensure_whisper(
|
40 |
wav, sr = torchaudio.load(audio_path)
|
41 |
if sr != 16000:
|
42 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
@@ -47,9 +46,9 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
|
|
47 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
48 |
return f'<|sound_start|>{result}<|sound_end|>'
|
49 |
|
50 |
-
@spaces.
|
51 |
def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
|
52 |
-
vq_model.ensure_whisper(
|
53 |
wav, sr = torchaudio.load(audio_path)
|
54 |
if sr != 16000:
|
55 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
@@ -59,35 +58,29 @@ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
|
|
59 |
|
60 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
61 |
return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
|
62 |
-
# print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
|
63 |
-
# print(tokenizer.eos_token)
|
64 |
|
65 |
-
@spaces.
|
66 |
def text_to_audio_file(text):
|
67 |
-
# gen a random id for the audio file
|
68 |
id = str(uuid.uuid4())
|
69 |
temp_file = f"./user_audio/{id}_temp_audio.wav"
|
70 |
text = text
|
71 |
text_split = "_".join(text.lower().split(" "))
|
72 |
-
# remove the last character if it is a period
|
73 |
if text_split[-1] == ".":
|
74 |
text_split = text_split[:-1]
|
75 |
-
tts = TTSProcessor(
|
76 |
tts.convert_text_to_audio_file(text, temp_file)
|
77 |
-
# logging.info(f"Saving audio to {temp_file}")
|
78 |
-
# torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
|
79 |
print(f"Saved audio to {temp_file}")
|
80 |
return temp_file
|
81 |
|
82 |
|
83 |
-
@spaces.
|
84 |
def process_input(audio_file=None):
|
85 |
|
86 |
for partial_message in process_audio(audio_file):
|
87 |
yield partial_message
|
88 |
|
89 |
|
90 |
-
@spaces.
|
91 |
def process_transcribe_input(audio_file=None):
|
92 |
|
93 |
for partial_message in process_audio(audio_file, transcript=True):
|
@@ -102,7 +95,7 @@ class StopOnTokens(StoppingCriteria):
|
|
102 |
return True
|
103 |
return False
|
104 |
|
105 |
-
@spaces.
|
106 |
def process_audio(audio_file, transcript=False):
|
107 |
if audio_file is None:
|
108 |
raise ValueError("No audio file provided")
|
|
|
14 |
)
|
15 |
import uuid
|
16 |
|
17 |
+
device = "cpu" # Change this to always use CPU
|
|
|
18 |
vq_model = RQBottleneckTransformer.load_model(
|
19 |
"whisper-vq-stoks-medium-en+pl-fixed.model"
|
20 |
).to(device)
|
21 |
+
|
22 |
use_8bit = False
|
23 |
llm_path = "QuietImpostor/Llama-3.2s-1B-Instruct-v0.1"
|
24 |
tokenizer = AutoTokenizer.from_pretrained(llm_path)
|
|
|
30 |
llm_int8_has_fp16_weight=False,
|
31 |
)
|
32 |
else:
|
33 |
+
model_kwargs["torch_dtype"] = torch.float32 # Change this to use float32 on CPU
|
34 |
model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
|
35 |
|
36 |
+
@spaces.CPU # Change this to use CPU
|
37 |
def audio_to_sound_tokens_whisperspeech(audio_path):
|
38 |
+
vq_model.ensure_whisper(device) # Change this to use the defined device
|
39 |
wav, sr = torchaudio.load(audio_path)
|
40 |
if sr != 16000:
|
41 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
|
|
46 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
47 |
return f'<|sound_start|>{result}<|sound_end|>'
|
48 |
|
49 |
+
@spaces.CPU # Change this to use CPU
|
50 |
def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
|
51 |
+
vq_model.ensure_whisper(device) # Change this to use the defined device
|
52 |
wav, sr = torchaudio.load(audio_path)
|
53 |
if sr != 16000:
|
54 |
wav = torchaudio.functional.resample(wav, sr, 16000)
|
|
|
58 |
|
59 |
result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
|
60 |
return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
|
|
|
|
|
61 |
|
62 |
+
# NOTE(review): the Hugging Face `spaces` package documents a `GPU` decorator;
# confirm that `spaces.CPU` really exists -- if it does not, this line raises
# AttributeError at import time and the decorator should simply be removed.
@spaces.CPU
def text_to_audio_file(text):
    """Convert ``text`` to speech and return the path of the audio file.

    A fresh UUID is embedded in the file name so that separate requests get
    separate output files.

    Args:
        text: The text to synthesize.

    Returns:
        The path handed to ``TTSProcessor.convert_text_to_audio_file``, of the
        form ``./user_audio/<uuid>_temp_audio.wav``.
    """
    import os  # function-scope import: the file's import block is not visible here

    # Make sure the target directory exists before the TTS backend is asked
    # to write into it; a no-op when it is already there.
    os.makedirs("./user_audio", exist_ok=True)

    # `audio_id` instead of `id`, so the builtin is not shadowed.
    audio_id = str(uuid.uuid4())
    temp_file = f"./user_audio/{audio_id}_temp_audio.wav"

    # Run the TTS on the module-level `device`.
    tts = TTSProcessor(device)
    tts.convert_text_to_audio_file(text, temp_file)
    print(f"Saved audio to {temp_file}")
    return temp_file
|
74 |
|
75 |
|
76 |
+
@spaces.CPU
def process_input(audio_file=None):
    """Stream the partial messages produced by ``process_audio``.

    Args:
        audio_file: Passed through unchanged to ``process_audio``.

    Yields:
        Each partial message, in the order ``process_audio`` produces them.
    """
    # Delegate the whole iteration to process_audio (default, non-transcript call).
    yield from process_audio(audio_file)
|
81 |
|
82 |
|
83 |
+
@spaces.CPU
|
84 |
def process_transcribe_input(audio_file=None):
|
85 |
|
86 |
for partial_message in process_audio(audio_file, transcript=True):
|
|
|
95 |
return True
|
96 |
return False
|
97 |
|
98 |
+
@spaces.CPU
|
99 |
def process_audio(audio_file, transcript=False):
|
100 |
if audio_file is None:
|
101 |
raise ValueError("No audio file provided")
|