QuietImpostor committed on
Commit
cc0fe39
·
verified ·
1 Parent(s): 0646ffe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +12 -19
app.py CHANGED
@@ -14,12 +14,11 @@ from generate_audio import (
14
  )
15
  import uuid
16
 
17
-
18
- device = "cuda" if torch.cuda.is_available() else "cpu"
19
  vq_model = RQBottleneckTransformer.load_model(
20
  "whisper-vq-stoks-medium-en+pl-fixed.model"
21
  ).to(device)
22
- # tts = TTSProcessor('cpu')
23
  use_8bit = False
24
  llm_path = "QuietImpostor/Llama-3.2s-1B-Instruct-v0.1"
25
  tokenizer = AutoTokenizer.from_pretrained(llm_path)
@@ -31,12 +30,12 @@ if use_8bit:
31
  llm_int8_has_fp16_weight=False,
32
  )
33
  else:
34
- model_kwargs["torch_dtype"] = torch.bfloat16
35
  model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
36
 
37
- @spaces.GPU
38
  def audio_to_sound_tokens_whisperspeech(audio_path):
39
- vq_model.ensure_whisper('cuda')
40
  wav, sr = torchaudio.load(audio_path)
41
  if sr != 16000:
42
  wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -47,9 +46,9 @@ def audio_to_sound_tokens_whisperspeech(audio_path):
47
  result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
48
  return f'<|sound_start|>{result}<|sound_end|>'
49
 
50
- @spaces.GPU
51
  def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
52
- vq_model.ensure_whisper('cuda')
53
  wav, sr = torchaudio.load(audio_path)
54
  if sr != 16000:
55
  wav = torchaudio.functional.resample(wav, sr, 16000)
@@ -59,35 +58,29 @@ def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
59
 
60
  result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
61
  return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
62
- # print(tokenizer.encode("<|sound_0001|>", add_special_tokens=False))# return the audio tensor
63
- # print(tokenizer.eos_token)
64
 
65
- @spaces.GPU
66
  def text_to_audio_file(text):
67
- # gen a random id for the audio file
68
  id = str(uuid.uuid4())
69
  temp_file = f"./user_audio/{id}_temp_audio.wav"
70
  text = text
71
  text_split = "_".join(text.lower().split(" "))
72
- # remove the last character if it is a period
73
  if text_split[-1] == ".":
74
  text_split = text_split[:-1]
75
- tts = TTSProcessor("cuda")
76
  tts.convert_text_to_audio_file(text, temp_file)
77
- # logging.info(f"Saving audio to {temp_file}")
78
- # torchaudio.save(temp_file, audio.cpu(), sample_rate=24000)
79
  print(f"Saved audio to {temp_file}")
80
  return temp_file
81
 
82
 
83
- @spaces.GPU
84
  def process_input(audio_file=None):
85
 
86
  for partial_message in process_audio(audio_file):
87
  yield partial_message
88
 
89
 
90
- @spaces.GPU
91
  def process_transcribe_input(audio_file=None):
92
 
93
  for partial_message in process_audio(audio_file, transcript=True):
@@ -102,7 +95,7 @@ class StopOnTokens(StoppingCriteria):
102
  return True
103
  return False
104
 
105
- @spaces.GPU
106
  def process_audio(audio_file, transcript=False):
107
  if audio_file is None:
108
  raise ValueError("No audio file provided")
 
14
  )
15
  import uuid
16
 
17
+ device = "cpu" # Change this to always use CPU
 
18
  vq_model = RQBottleneckTransformer.load_model(
19
  "whisper-vq-stoks-medium-en+pl-fixed.model"
20
  ).to(device)
21
+
22
  use_8bit = False
23
  llm_path = "QuietImpostor/Llama-3.2s-1B-Instruct-v0.1"
24
  tokenizer = AutoTokenizer.from_pretrained(llm_path)
 
30
  llm_int8_has_fp16_weight=False,
31
  )
32
  else:
33
+ model_kwargs["torch_dtype"] = torch.float32 # Change this to use float32 on CPU
34
  model = AutoModelForCausalLM.from_pretrained(llm_path, **model_kwargs).to(device)
35
 
36
+ @spaces.CPU # Change this to use CPU
37
  def audio_to_sound_tokens_whisperspeech(audio_path):
38
+ vq_model.ensure_whisper(device) # Change this to use the defined device
39
  wav, sr = torchaudio.load(audio_path)
40
  if sr != 16000:
41
  wav = torchaudio.functional.resample(wav, sr, 16000)
 
46
  result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
47
  return f'<|sound_start|>{result}<|sound_end|>'
48
 
49
+ @spaces.CPU # Change this to use CPU
50
  def audio_to_sound_tokens_whisperspeech_transcribe(audio_path):
51
+ vq_model.ensure_whisper(device) # Change this to use the defined device
52
  wav, sr = torchaudio.load(audio_path)
53
  if sr != 16000:
54
  wav = torchaudio.functional.resample(wav, sr, 16000)
 
58
 
59
  result = ''.join(f'<|sound_{num:04d}|>' for num in codes)
60
  return f'<|reserved_special_token_69|><|sound_start|>{result}<|sound_end|>'
 
 
61
 
62
+ @spaces.CPU # Change this to use CPU
63
  def text_to_audio_file(text):
 
64
  id = str(uuid.uuid4())
65
  temp_file = f"./user_audio/{id}_temp_audio.wav"
66
  text = text
67
  text_split = "_".join(text.lower().split(" "))
 
68
  if text_split[-1] == ".":
69
  text_split = text_split[:-1]
70
+ tts = TTSProcessor(device) # Change this to use the defined device
71
  tts.convert_text_to_audio_file(text, temp_file)
 
 
72
  print(f"Saved audio to {temp_file}")
73
  return temp_file
74
 
75
 
76
+ @spaces.CPU
77
  def process_input(audio_file=None):
78
 
79
  for partial_message in process_audio(audio_file):
80
  yield partial_message
81
 
82
 
83
+ @spaces.CPU
84
  def process_transcribe_input(audio_file=None):
85
 
86
  for partial_message in process_audio(audio_file, transcript=True):
 
95
  return True
96
  return False
97
 
98
+ @spaces.CPU
99
  def process_audio(audio_file, transcript=False):
100
  if audio_file is None:
101
  raise ValueError("No audio file provided")