Spaces:
Sleeping
Sleeping
Commit
·
f599acb
1
Parent(s):
5dfc9ca
made share true
Browse files
app.py
CHANGED
@@ -121,11 +121,12 @@ def parler_preprocess(text):
|
|
121 |
return text
|
122 |
|
123 |
def text_to_speech(text):
|
124 |
-
# Asian mom nagging style description
|
125 |
description = ("Elisabeth speaks in a mature, strict, nagging, and slightly disappointed tone, "
|
126 |
"with a hint of love and high expectations, at a moderate pace with high quality audio. "
|
127 |
"She sounds like a stereotypical Asian mother who compares you to your cousins, "
|
128 |
"questions your life choices, and threatens you with a slipper, but ultimately wants the best for you.")
|
|
|
|
|
129 |
inputs = parler_tokenizer(description, return_tensors="pt").to(parler_device)
|
130 |
prompt = parler_tokenizer(parler_preprocess(text), return_tensors="pt").to(parler_device)
|
131 |
set_seed(PARLER_SEED)
|
@@ -136,7 +137,10 @@ def text_to_speech(text):
|
|
136 |
def process_frame(image, vision_components, llm_components):
|
137 |
caption = analyze_image(image, vision_components)
|
138 |
roast = generate_roast(caption, llm_components)
|
139 |
-
|
|
|
|
|
|
|
140 |
return caption, roast, audio
|
141 |
|
142 |
def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
|
@@ -185,4 +189,4 @@ if __name__ == "__main__":
|
|
185 |
os.system('python -m unidic download')
|
186 |
nltk.download('averaged_perceptron_tagger_eng')
|
187 |
app = create_app()
|
188 |
-
app.launch()
|
|
|
121 |
return text
|
122 |
|
123 |
def text_to_speech(text):
|
|
|
124 |
description = ("Elisabeth speaks in a mature, strict, nagging, and slightly disappointed tone, "
|
125 |
"with a hint of love and high expectations, at a moderate pace with high quality audio. "
|
126 |
"She sounds like a stereotypical Asian mother who compares you to your cousins, "
|
127 |
"questions your life choices, and threatens you with a slipper, but ultimately wants the best for you.")
|
128 |
+
if not text or not isinstance(text, str):
|
129 |
+
return (PARLER_SAMPLE_RATE, np.zeros(1))
|
130 |
inputs = parler_tokenizer(description, return_tensors="pt").to(parler_device)
|
131 |
prompt = parler_tokenizer(parler_preprocess(text), return_tensors="pt").to(parler_device)
|
132 |
set_seed(PARLER_SEED)
|
|
|
137 |
def process_frame(image, vision_components, llm_components):
|
138 |
caption = analyze_image(image, vision_components)
|
139 |
roast = generate_roast(caption, llm_components)
|
140 |
+
if not roast or not isinstance(roast, str):
|
141 |
+
audio = (PARLER_SAMPLE_RATE, np.zeros(1))
|
142 |
+
else:
|
143 |
+
audio = text_to_speech(roast)
|
144 |
return caption, roast, audio
|
145 |
|
146 |
def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
|
|
|
189 |
os.system('python -m unidic download')
|
190 |
nltk.download('averaged_perceptron_tagger_eng')
|
191 |
app = create_app()
|
192 |
+
app.launch(share=True)
|