Commit e94ed44 · Parent(s): Init

Files changed:
- README.md +48 -0
- app.py +160 -0
- requirements.txt +9 -0
README.md
ADDED
@@ -0,0 +1,48 @@
---
title: AsianMOM
emoji: 💢
colorFrom: red
colorTo: red
sdk: gradio
sdk_version: 4.31.2
app_file: app.py
pinned: true
---

# AsianMOM 💢

**AsianMOM** is a fun, interactive Gradio Space that uses your webcam to observe what you're doing and then roasts you like a stereotypical Asian mom—complete with high expectations, cousin comparisons, and slipper threats!

## 🚀 Features

- **Live Webcam Feed**: Observes your actions in real time.
- **Vision Model**: Describes what it sees using BLIP image captioning.
- **Roast Generation**: Uses Meta's Llama-3.2-1B-Instruct to generate witty, culturally inspired "mom roasts".
- **Text-to-Speech**: Delivers the roast in a mature, motherly voice using Parler-TTS.
- **Fully Automated**: No button presses needed—just let AsianMOM do her thing!

## 🛠️ How It Works

1. **Webcam Capture**: The app streams your webcam feed.
2. **Image Captioning**: The BLIP model generates a description of what you're doing.
3. **Roast Generation**: Llama-3.2-1B-Instruct crafts a humorous, mom-style roast based on the caption.
4. **Voice Output**: Parler-TTS reads the roast aloud in a fitting voice.

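In code, the chain is roughly the following. This is a minimal sketch that reuses the helpers defined in `app.py`; `frame.jpg` stands in for a captured webcam frame, and running it downloads all three models:

```python
from PIL import Image

from app import (
    initialize_vision_model, initialize_llm, initialize_tts_model,
    analyze_image, generate_roast, text_to_speech,
)

# Load the three models once: BLIP captioner, Llama-3.2-1B-Instruct, Parler-TTS
vision_components = initialize_vision_model()
llm_components = initialize_llm()
tts_pipeline = initialize_tts_model()

# One pass of the loop: frame -> caption -> roast -> audio
frame = Image.open("frame.jpg")                    # stand-in for a webcam frame
caption = analyze_image(frame, vision_components)  # e.g. "a man sitting at a desk"
roast = generate_roast(caption, llm_components)    # mom-style roast text
sampling_rate, audio = text_to_speech(roast, tts_pipeline)  # playable audio
```
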
## 📦 Setup & Usage

1. **Clone or Fork this Space**
2. Ensure the Space runs on GPU hardware (T4 or better recommended)
3. All dependencies are managed via `requirements.txt`
4. Launch the Space and allow webcam access
5. Enjoy being roasted by AsianMOM!

## 🧩 Models Used

- [BLIP Image Captioning](https://huggingface.co/Salesforce/blip-image-captioning-base)
- [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)
- [Parler-TTS Mini Expresso](https://huggingface.co/parler-tts/parler-tts-mini-expresso)

## 🙏 Credits

- Inspired by classic Asian mom humor and memes
- Built with [Gradio](https://gradio.app/)
- Powered by Hugging Face models

## ⚠️ Disclaimer

This app is for entertainment purposes only. Stereotypes are used in a lighthearted, humorous way—please use responsibly and respectfully.
app.py
ADDED
@@ -0,0 +1,160 @@
import os
import gradio as gr
import torch
import cv2
from PIL import Image
import numpy as np
from transformers import pipeline, AutoProcessor, AutoModelForVision2Seq
from transformers import AutoModelForCausalLM, AutoTokenizer
import time

from transformers import BlipProcessor, BlipForConditionalGeneration

# Set environment variables
os.environ["TOKENIZERS_PARALLELISM"] = "false"

+
def initialize_vision_model():
|
17 |
+
# Using BLIP for image captioning - lightweight but effective
|
18 |
+
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
|
19 |
+
model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
|
20 |
+
|
21 |
+
return {
|
22 |
+
"processor": processor,
|
23 |
+
"model": model
|
24 |
+
}
|
25 |
+
|
26 |
+
def analyze_image(image, vision_components):
|
27 |
+
processor = vision_components["processor"]
|
28 |
+
model = vision_components["model"]
|
29 |
+
|
30 |
+
# Convert to RGB if needed
|
31 |
+
if isinstance(image, np.ndarray):
|
32 |
+
image = Image.fromarray(image)
|
33 |
+
|
34 |
+
inputs = processor(image, return_tensors="pt")
|
35 |
+
|
36 |
+
with torch.no_grad():
|
37 |
+
outputs = model.generate(**inputs, max_length=30)
|
38 |
+
|
39 |
+
caption = processor.decode(outputs[0], skip_special_tokens=True)
|
40 |
+
return caption
|
41 |
+
|
def initialize_llm():
    model_id = "meta-llama/Llama-3.2-1B-Instruct"
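    # Note: meta-llama checkpoints are gated on the Hugging Face Hub, so loading this
    # assumes the Space has been granted access and has a valid HF token configured.
    # Llama 3.2 also requires a newer transformers release than the 4.36.2 pinned in
    # requirements.txt, which predates the Llama 3.2 config format.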
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto"
    )

    return {
        "model": model,
        "tokenizer": tokenizer
    }

def generate_roast(caption, llm_components):
    model = llm_components["model"]
    tokenizer = llm_components["tokenizer"]

    prompt = f"""[INST] You are AsianMOM, a stereotypical Asian mother who always has high expectations.
You just observed your child doing this: "{caption}"

Respond with a short, humorous roast (maximum 2-3 sentences) in the style of a stereotypical Asian mother.
Include at least one of these elements:
- Comparison to more successful relatives/cousins
- High expectations about academic success
- Mild threats about using slippers
- Questioning life choices
- Asking when they'll get married or have kids
- Commenting on appearance
- Saying "back in my day" and describing hardship

Be funny but not hurtful. Keep it brief. [/INST]"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=300,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Extract just the response part, not the prompt; this relies on the literal
    # "[/INST]" marker in the prompt above (Llama 3 has its own chat template,
    # so the tag is only used here as a plain-text delimiter).
    response = response.split("[/INST]")[1].strip()

    return response

def initialize_tts_model():
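    # Note: Parler-TTS is a custom architecture; loading it through the generic
    # "text-to-speech" pipeline assumes the parler_tts package (or equivalent
    # transformers support) is available, which requirements.txt does not install.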
    tts_pipeline = pipeline(
        "text-to-speech",
        model="parler-tts/parler-tts-mini-expresso"
    )
    return tts_pipeline

def text_to_speech(text, tts_pipeline):
    # Additional prompt to guide the voice style
    styled_text = f"[[voice:female_mature]] [[speed:0.9]] [[precision:0.8]] {text}"

    speech = tts_pipeline(styled_text)
    return (speech["sampling_rate"], speech["audio"])

def process_frame(image, vision_components, llm_components, tts_pipeline):
    # Step 1: Analyze what's in the image
    caption = analyze_image(image, vision_components)

    # Step 2: Generate roast based on the caption
    roast = generate_roast(caption, llm_components)

    # Step 3: Convert roast to speech
    audio = text_to_speech(roast, tts_pipeline)

    return caption, roast, audio

def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
    # Initialize all models
    vision_components = initialize_vision_model()
    llm_components = initialize_llm()
    tts_pipeline = initialize_tts_model()

    last_process_time = time.time() - 10  # Initialize with an offset
    processing_interval = 5  # Process every 5 seconds

    def process_webcam(image):
        nonlocal last_process_time

        current_time = time.time()
        if current_time - last_process_time >= processing_interval and image is not None:
            last_process_time = current_time

            caption, roast, audio = process_frame(
                image,
                vision_components,
                llm_components,
                tts_pipeline
            )

            return image, caption, roast, audio

        # Return None for outputs that shouldn't update
        return image, None, None, None

    video_feed.change(
        process_webcam,
        inputs=[video_feed],
        outputs=[video_feed, analysis_output, roast_output, audio_output]
    )
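    # Note: with a streaming image input, recent Gradio versions also expose a
    # `video_feed.stream(...)` event, which is the usual hook for continuous
    # webcam frames; `.change()` fires whenever the component's value updates.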

def create_app():
    with gr.Blocks(theme=gr.themes.Soft()) as app:
        gr.Markdown("# AsianMOM: Asian Mother Observer & Mocker")
        gr.Markdown("### Camera captures what you're doing and your Asian mom responds appropriately")

        with gr.Row():
            with gr.Column():
                # Gradio 4.x expects `sources` (a list) rather than the older `source` argument
                video_feed = gr.Image(sources=["webcam"], streaming=True, label="Camera Feed")

            with gr.Column():
                analysis_output = gr.Textbox(label="What AsianMOM Sees", lines=2)
                roast_output = gr.Textbox(label="AsianMOM's Thoughts", lines=4)
                audio_output = gr.Audio(label="AsianMOM Says", autoplay=True)

        # Setup the processing chain
        setup_processing_chain(video_feed, analysis_output, roast_output, audio_output)

    return app

if __name__ == "__main__":
    app = create_app()
    app.launch()
requirements.txt
ADDED
@@ -0,0 +1,9 @@
gradio==4.26.0
torch==2.1.0
torchvision
transformers==4.36.2
pillow
numpy
accelerate
git+https://github.com/huggingface/diffusers
opencv-python