Spaces:
Running
Running
Commit
·
bba8dbb
1
Parent(s):
f599acb
Fixed bool iterable error
Browse files
app.py
CHANGED
@@ -33,18 +33,16 @@ def initialize_vision_model():
|
|
33 |
def analyze_image(image, vision_components):
|
34 |
processor = vision_components["processor"]
|
35 |
model = vision_components["model"]
|
36 |
-
|
37 |
-
# Convert to RGB if needed
|
38 |
if isinstance(image, np.ndarray):
|
39 |
image = Image.fromarray(image)
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
|
47 |
-
|
48 |
|
49 |
def initialize_llm():
|
50 |
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
@@ -76,25 +74,22 @@ def initialize_llm():
|
|
76 |
def generate_roast(caption, llm_components):
|
77 |
model = llm_components["model"]
|
78 |
tokenizer = llm_components["tokenizer"]
|
79 |
-
|
80 |
prompt = f"""[INST] You are AsianMOM, a stereotypical Asian mother who always has high expectations. \nYou just observed your child doing this: \"{caption}\"\n \nRespond with a short, humorous roast (maximum 2-3 sentences) in the style of a stereotypical Asian mother. \nInclude at least one of these elements:\n- Comparison to more successful relatives/cousins\n- High expectations about academic success\n- Mild threats about using slippers\n- Questioning life choices\n- Asking when they'll get married or have kids\n- Commenting on appearance\n- Saying \"back in my day\" and describing hardship\n\nBe funny but not hurtful. Keep it brief. [/INST]"""
|
81 |
-
|
82 |
-
|
83 |
-
|
84 |
-
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
|
91 |
-
)
|
92 |
-
|
93 |
-
|
94 |
-
|
95 |
-
|
96 |
-
|
97 |
-
return response
|
98 |
|
99 |
# Parler-TTS setup
|
100 |
parler_device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
@@ -151,6 +146,9 @@ def setup_processing_chain(video_feed, analysis_output, roast_output, audio_outp
|
|
151 |
def process_webcam(image):
|
152 |
nonlocal last_process_time
|
153 |
current_time = time.time()
|
|
|
|
|
|
|
154 |
if current_time - last_process_time >= processing_interval and image is not None:
|
155 |
last_process_time = current_time
|
156 |
caption, roast, audio = process_frame(
|
@@ -158,8 +156,11 @@ def setup_processing_chain(video_feed, analysis_output, roast_output, audio_outp
|
|
158 |
vision_components,
|
159 |
llm_components
|
160 |
)
|
161 |
-
|
162 |
-
|
|
|
|
|
|
|
163 |
video_feed.change(
|
164 |
process_webcam,
|
165 |
inputs=[video_feed],
|
|
|
33 |
def analyze_image(image, vision_components):
    """Caption a single frame with the vision captioning model.

    Args:
        image: A webcam frame. ``np.ndarray`` inputs are converted to a PIL
            image first; anything else is handed to the processor as-is.
        vision_components: Dict holding the ``"processor"`` and ``"model"``
            for the captioning pipeline.

    Returns:
        The decoded caption string, or ``""`` when captioning raises or the
        decoder yields a non-string value (the "bool iterable" guard).
    """
    processor = vision_components["processor"]
    model = vision_components["model"]

    # Raw webcam frames arrive as numpy arrays; the processor expects PIL.
    if isinstance(image, np.ndarray):
        image = Image.fromarray(image)

    try:
        model_inputs = processor(image, return_tensors="pt")
        with torch.no_grad():  # pure inference — no autograd bookkeeping
            generated_ids = model.generate(**model_inputs, max_length=30)
        decoded = processor.decode(generated_ids[0], skip_special_tokens=True)
        # Only ever hand a real string downstream.
        return decoded if isinstance(decoded, str) else ""
    except Exception:
        # Best-effort: a failed caption must not crash the streaming loop.
        return ""
|
46 |
|
47 |
def initialize_llm():
|
48 |
model_id = "meta-llama/Llama-3.2-1B-Instruct"
|
|
|
74 |
def generate_roast(caption, llm_components):
    """Produce a short "Asian mom"-style roast reacting to *caption*.

    Args:
        caption: Caption text describing what the webcam observed.
        llm_components: Dict holding the ``"model"`` and ``"tokenizer"``
            for the instruct LLM.

    Returns:
        The roast text after the closing ``[/INST]`` tag, or ``""`` when
        generation fails or the model output lacks the expected tag.
    """
    model = llm_components["model"]
    tokenizer = llm_components["tokenizer"]

    prompt = f"""[INST] You are AsianMOM, a stereotypical Asian mother who always has high expectations. \nYou just observed your child doing this: \"{caption}\"\n \nRespond with a short, humorous roast (maximum 2-3 sentences) in the style of a stereotypical Asian mother. \nInclude at least one of these elements:\n- Comparison to more successful relatives/cousins\n- High expectations about academic success\n- Mild threats about using slippers\n- Questioning life choices\n- Asking when they'll get married or have kids\n- Commenting on appearance\n- Saying \"back in my day\" and describing hardship\n\nBe funny but not hurtful. Keep it brief. [/INST]"""

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        with torch.no_grad():  # pure inference — no autograd bookkeeping
            # NOTE(review): max_length counts prompt tokens too; if replies
            # come back truncated or empty, max_new_tokens is likely the
            # intended knob — confirm before changing.
            outputs = model.generate(
                **inputs,
                max_length=300,
                temperature=0.7,
                top_p=0.9,
                do_sample=True
            )
        response = tokenizer.decode(outputs[0], skip_special_tokens=True)
        # The decoded text echoes the prompt; keep only what follows the
        # closing [/INST] tag. Handle a missing tag explicitly rather than
        # letting IndexError fall through to the broad except below, which
        # would mask genuine generation errors.
        parts = response.split("[/INST]")
        return parts[1].strip() if len(parts) > 1 else ""
    except Exception:
        # Best-effort: a failed roast must not crash the app.
        return ""
|
|
|
|
|
93 |
|
94 |
# Parler-TTS setup
|
95 |
parler_device = "cuda:0" if torch.cuda.is_available() else "cpu"
|
|
|
146 |
def process_webcam(image):
|
147 |
nonlocal last_process_time
|
148 |
current_time = time.time()
|
149 |
+
default_caption = ""
|
150 |
+
default_roast = ""
|
151 |
+
default_audio = (PARLER_SAMPLE_RATE, np.zeros(1))
|
152 |
if current_time - last_process_time >= processing_interval and image is not None:
|
153 |
last_process_time = current_time
|
154 |
caption, roast, audio = process_frame(
|
|
|
156 |
vision_components,
|
157 |
llm_components
|
158 |
)
|
159 |
+
final_caption = caption if isinstance(caption, str) else default_caption
|
160 |
+
final_roast = roast if isinstance(roast, str) else default_roast
|
161 |
+
final_audio = audio if isinstance(audio, tuple) and len(audio) == 2 and isinstance(audio[1], np.ndarray) else default_audio
|
162 |
+
return image, final_caption, final_roast, final_audio
|
163 |
+
return image, default_caption, default_roast, default_audio
|
164 |
video_feed.change(
|
165 |
process_webcam,
|
166 |
inputs=[video_feed],
|