Llama-3.2s-1B-Instruct-v0.1

Running

App Files Community

QuietImpostor committed on Sep 27, 2024

Commit

cc0fe39

verified ·

1 Parent(s): 0646ffe

Update app.py

Browse files

Files changed (1) hide show

app.py +12 -19

app.py CHANGED Viewed

@@ -14,12 +14,11 @@ from generate_audio import (
 )
 import uuid
-device = "cuda" if torch.cuda.is_available() else "cpu"
 vq_model = RQBottleneckTransformer.load_model(
         "whisper-vq-stoks-medium-en+pl-fixed.model"
     ).to(device)
-# tts = TTSProcessor('cpu')
 use_8bit = False
 llm_path = "QuietImpostor/Llama-3.2s-1B-Instruct-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(llm_path)
@@ -31,12 +30,12 @@ if use_8bit:
         llm_int8_has_fp16_weight=False,
     )
 else:
-    model_kwargs["torch_dtype"] = torch.bfloat16
 model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
-@spaces.GPU
 def audio_to_sound_tokens_whisperspeech(audio_path):
-    vq_model.ensure_whisper('cuda')
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -47,9 +46,9 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|sound_start|>{result}<|sound_end|>'
-@spaces.GPU
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
-    vq_model.ensure_whisper('cuda')
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -59,35 +58,29 @@ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
-# print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
-# print(tokenizer.eos_token)
-@spaces.GPU
 def text_to_audio_file(text):
-    # gen a random id for the audio file
     id = str(uuid.uuid4())
     temp_file = f"./user_audio/{id}_temp_audio.wav"
     text = text
     text_split = "_".join(text.lower().split(" "))
-    # remove the last character if it is a period
     if text_split[-1] == ".":
         text_split = text_split[:-1]
-    tts = TTSProcessor("cuda")
     tts.convert_text_to_audio_file(text, temp_file)
-    # logging.info(f"Saving audio to {temp_file}")
-    # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
     print(f"Saved audio to {temp_file}")
     return temp_file
-@spaces.GPU
 def process_input(audio_file=None):
     for partial_message in process_audio(audio_file):
         yield partial_message
-@spaces.GPU
 def process_transcribe_input(audio_file=None):
     for partial_message in process_audio(audio_file, transcript=True):
@@ -102,7 +95,7 @@ class StopOnTokens(StoppingCriteria):
                 return True
         return False
-@spaces.GPU
 def process_audio(audio_file, transcript=False):
     if audio_file is None:
             raise ValueError("No audio file provided")

 )
 import uuid
+device = "cpu"  # Change this to always use CPU
 vq_model = RQBottleneckTransformer.load_model(
         "whisper-vq-stoks-medium-en+pl-fixed.model"
     ).to(device)
 use_8bit = False
 llm_path = "QuietImpostor/Llama-3.2s-1B-Instruct-v0.1"
 tokenizer = AutoTokenizer.from_pretrained(llm_path)
         llm_int8_has_fp16_weight=False,
     )
 else:
+    model_kwargs["torch_dtype"] = torch.float32  # Change this to use float32 on CPU
 model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
+@spaces.CPU  # Change this to use CPU
 def audio_to_sound_tokens_whisperspeech(audio_path):
+    vq_model.ensure_whisper(device)  # Change this to use the defined device
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|sound_start|>{result}<|sound_end|>'
+@spaces.CPU  # Change this to use CPU
 def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
+    vq_model.ensure_whisper(device)  # Change this to use the defined device
     wav, sr = torchaudio.load(audio_path)
     if sr != 16000:
         wav = torchaudio.functional.resample(wav, sr, 16000)
     result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
     return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
+@spaces.CPU  # Change this to use CPU
 def text_to_audio_file(text):
     id = str(uuid.uuid4())
     temp_file = f"./user_audio/{id}_temp_audio.wav"
     text = text
     text_split = "_".join(text.lower().split(" "))
     if text_split[-1] == ".":
         text_split = text_split[:-1]
+    tts = TTSProcessor(device)  # Change this to use the defined device
     tts.convert_text_to_audio_file(text, temp_file)
     print(f"Saved audio to {temp_file}")
     return temp_file
+@spaces.CPU
 def process_input(audio_file=None):
     for partial_message in process_audio(audio_file):
         yield partial_message
+@spaces.CPU
 def process_transcribe_input(audio_file=None):
     for partial_message in process_audio(audio_file, transcript=True):
                 return True
         return False
+@spaces.CPU
 def process_audio(audio_file, transcript=False):
     if audio_file is None:
             raise ValueError("No audio file provided")