Kuberwastaken committed
Commit e94ed44 (0 parents)
Files changed (3)
  1. README.md +48 -0
  2. app.py +160 -0
  3. requirements.txt +9 -0
README.md ADDED
@@ -0,0 +1,48 @@
+ ---
+ title: AsianMOM
+ emoji: 💢
+ colorFrom: red
+ colorTo: red
+ sdk: gradio
+ sdk_version: 4.31.2
+ app_file: app.py
+ pinned: true
+ ---
+
+ # AsianMOM 💢
+
+ **AsianMOM** is a fun, interactive Gradio Space that uses your webcam to observe what you're doing and then roasts you like a stereotypical Asian mom, complete with high expectations, cousin comparisons, and slipper threats!
+
+ ## 🚀 Features
+ - **Live Webcam Feed**: Observes your actions in real time.
+ - **Vision Model**: Describes what it sees using BLIP image captioning.
+ - **Roast Generation**: Uses Meta's Llama-3.2-1B-Instruct to generate witty, culturally inspired "mom roasts".
+ - **Text-to-Speech**: Delivers the roast in a mature, motherly voice using Parler-TTS.
+ - **Fully Automated**: No button presses needed; just let AsianMOM do her thing!
+
+ ## 🛠️ How It Works
+ 1. **Webcam Capture**: The app streams your webcam feed.
+ 2. **Image Captioning**: The BLIP model generates a description of what you're doing (sketched in code below).
+ 3. **Roast Generation**: Llama-3.2-1B-Instruct crafts a humorous, mom-style roast based on the caption.
+ 4. **Voice Output**: Parler-TTS reads the roast aloud in a fitting voice.
+
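+ A minimal, standalone sketch of the captioning step (assuming `transformers`, `torch`, and `pillow` are installed; `frame.jpg` is a stand-in for one webcam frame):
+
+ ```python
+ import torch
+ from PIL import Image
+ from transformers import BlipProcessor, BlipForConditionalGeneration
+
+ processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+ model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+ image = Image.open("frame.jpg")  # stand-in for a single webcam frame
+ inputs = processor(image, return_tensors="pt")
+ with torch.no_grad():
+     output_ids = model.generate(**inputs, max_new_tokens=30)
+ print(processor.decode(output_ids[0], skip_special_tokens=True))
+ ```
+
+ The roast and voice stages in `app.py` consume this caption in exactly the same way.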
+ ## 📦 Setup & Usage
+ 1. **Clone or fork this Space**
+ 2. Use GPU hardware (a T4 or better is recommended); the models run on CPU too, but far too slowly for real-time use
+ 3. All dependencies are managed via `requirements.txt`
+ 4. Request access to the gated [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) model and authenticate with your Hugging Face token (see the snippet below)
+ 5. Launch the Space and allow webcam access
+ 6. Enjoy being roasted by AsianMOM!
+
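+ Because the Llama checkpoint is gated, the Space needs a valid token when the model loads. One way to wire this up (a sketch assuming you store the token as an `HF_TOKEN` secret in your Space settings; that variable name is a convention, not a requirement):
+
+ ```python
+ import os
+ from huggingface_hub import login
+
+ # Authenticate before from_pretrained() pulls the gated weights
+ login(token=os.environ["HF_TOKEN"])
+ ```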
+ ## 🧩 Models Used
+ - [BLIP Image Captioning](https://huggingface.co/Salesforce/blip-image-captioning-base)
+ - [Llama-3.2-1B-Instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct)
+ - [Parler-TTS Mini Expresso](https://huggingface.co/parler-tts/parler-tts-mini-expresso)
+
+ ## 🙏 Credits
+ - Inspired by classic Asian mom humor and memes
+ - Built with [Gradio](https://gradio.app/)
+ - Powered by Hugging Face models
+
+ ## ⚠️ Disclaimer
+ This app is for entertainment purposes only. Stereotypes are used in a lighthearted, humorous way; please use responsibly and respectfully.
+
app.py ADDED
@@ -0,0 +1,160 @@
+ import os
+ import time
+
+ import gradio as gr
+ import numpy as np
+ import torch
+ from PIL import Image
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     BlipForConditionalGeneration,
+     BlipProcessor,
+ )
+ # Parler-TTS ships its own model class (installed from the parler-tts
+ # repo; see requirements.txt)
+ from parler_tts import ParlerTTSForConditionalGeneration
+
+ # Avoid fork warnings from the HF tokenizers library
+ os.environ["TOKENIZERS_PARALLELISM"] = "false"
+
+ def initialize_vision_model():
+     # BLIP base: a lightweight but effective image-captioning model
+     processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+     model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+     return {
+         "processor": processor,
+         "model": model
+     }
+
+ def analyze_image(image, vision_components):
+     processor = vision_components["processor"]
+     model = vision_components["model"]
+
+     # Gradio delivers webcam frames as numpy arrays; convert to PIL
+     if isinstance(image, np.ndarray):
+         image = Image.fromarray(image)
+
+     inputs = processor(image, return_tensors="pt")
+
+     with torch.no_grad():
+         outputs = model.generate(**inputs, max_length=30)
+
+     caption = processor.decode(outputs[0], skip_special_tokens=True)
+     return caption
+
+ def initialize_llm():
+     # Note: this checkpoint is gated; accept the license on the Hub and
+     # authenticate (e.g. via an HF_TOKEN Space secret) before loading.
+     model_id = "meta-llama/Llama-3.2-1B-Instruct"
+     tokenizer = AutoTokenizer.from_pretrained(model_id)
+     model = AutoModelForCausalLM.from_pretrained(
+         model_id,
+         torch_dtype=torch.bfloat16,
+         device_map="auto"
+     )
+
+     return {
+         "model": model,
+         "tokenizer": tokenizer
+     }
+
+ def generate_roast(caption, llm_components):
+     model = llm_components["model"]
+     tokenizer = llm_components["tokenizer"]
+
+     system_prompt = (
+         "You are AsianMOM, a stereotypical Asian mother who always has high expectations. "
+         "Respond with a short, humorous roast (maximum 2-3 sentences) in the style of a "
+         "stereotypical Asian mother. Include at least one of these elements:\n"
+         "- Comparison to more successful relatives/cousins\n"
+         "- High expectations about academic success\n"
+         "- Mild threats about using slippers\n"
+         "- Questioning life choices\n"
+         "- Asking when they'll get married or have kids\n"
+         "- Commenting on appearance\n"
+         "- Saying \"back in my day\" and describing hardship\n"
+         "Be funny but not hurtful. Keep it brief."
+     )
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": f'You just observed your child doing this: "{caption}"'},
+     ]
+
+     # Llama 3.x uses a chat template, not the Llama-2 [INST] format;
+     # apply_chat_template inserts the correct special tokens
+     input_ids = tokenizer.apply_chat_template(
+         messages, add_generation_prompt=True, return_tensors="pt"
+     ).to(model.device)
+
+     with torch.no_grad():
+         outputs = model.generate(
+             input_ids,
+             max_new_tokens=120,
+             temperature=0.7,
+             top_p=0.9,
+             do_sample=True,
+             pad_token_id=tokenizer.eos_token_id,
+         )
+
+     # Decode only the newly generated tokens, not the prompt
+     response = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)
+     return response.strip()
+
+ def initialize_tts_model():
+     # Parler-TTS is not served by the transformers "text-to-speech"
+     # pipeline; it uses its own model class, conditioned on a
+     # natural-language voice description rather than inline style tags
+     model = ParlerTTSForConditionalGeneration.from_pretrained("parler-tts/parler-tts-mini-expresso")
+     tokenizer = AutoTokenizer.from_pretrained("parler-tts/parler-tts-mini-expresso")
+     return {"model": model, "tokenizer": tokenizer}
+
+ def text_to_speech(text, tts_components):
+     model = tts_components["model"]
+     tokenizer = tts_components["tokenizer"]
+
+     # The description steers the voice style; Elisabeth is one of the
+     # named voices in the Expresso training data
+     description = "Elisabeth speaks in a mature, slightly annoyed tone at a measured pace with very clear audio."
+
+     input_ids = tokenizer(description, return_tensors="pt").input_ids
+     prompt_input_ids = tokenizer(text, return_tensors="pt").input_ids
+
+     with torch.no_grad():
+         generation = model.generate(input_ids=input_ids, prompt_input_ids=prompt_input_ids)
+
+     audio = generation.cpu().numpy().squeeze()
+     return (model.config.sampling_rate, audio)
+
+ def process_frame(image, vision_components, llm_components, tts_components):
+     # Step 1: Describe what's in the frame
+     caption = analyze_image(image, vision_components)
+
+     # Step 2: Generate a roast based on the caption
+     roast = generate_roast(caption, llm_components)
+
+     # Step 3: Convert the roast to speech
+     audio = text_to_speech(roast, tts_components)
+
+     return caption, roast, audio
+
+ def setup_processing_chain(video_feed, analysis_output, roast_output, audio_output):
+     # Initialize all models once, up front
+     vision_components = initialize_vision_model()
+     llm_components = initialize_llm()
+     tts_components = initialize_tts_model()
+
+     last_process_time = time.time() - 10  # Start ready to process immediately
+     processing_interval = 5  # Process at most one frame every 5 seconds
+
+     def process_webcam(image):
+         nonlocal last_process_time
+
+         current_time = time.time()
+         if image is not None and current_time - last_process_time >= processing_interval:
+             last_process_time = current_time
+             caption, roast, audio = process_frame(
+                 image,
+                 vision_components,
+                 llm_components,
+                 tts_components
+             )
+             return caption, roast, audio
+
+         # Between intervals, leave all outputs unchanged
+         # (a bare gr.update() changes no component properties)
+         return gr.update(), gr.update(), gr.update()
+
+     # With streaming=True, the stream event fires on each new frame;
+     # the webcam feed itself is not written back as an output
+     video_feed.stream(
+         process_webcam,
+         inputs=[video_feed],
+         outputs=[analysis_output, roast_output, audio_output]
+     )
+
+ def create_app():
+     with gr.Blocks(theme=gr.themes.Soft()) as app:
+         gr.Markdown("# AsianMOM: Asian Mother Observer & Mocker")
+         gr.Markdown("### The camera captures what you're doing and your Asian mom responds appropriately")
+
+         with gr.Row():
+             with gr.Column():
+                 # Gradio 4.x takes sources=[...]; the singular source=
+                 # keyword was removed after 3.x
+                 video_feed = gr.Image(sources=["webcam"], streaming=True, label="Camera Feed")
+
+             with gr.Column():
+                 analysis_output = gr.Textbox(label="What AsianMOM Sees", lines=2)
+                 roast_output = gr.Textbox(label="AsianMOM's Thoughts", lines=4)
+                 audio_output = gr.Audio(label="AsianMOM Says", autoplay=True)
+
+         # Set up the processing chain
+         setup_processing_chain(video_feed, analysis_output, roast_output, audio_output)
+
+     return app
+
+ if __name__ == "__main__":
+     app = create_app()
+     app.launch()
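
To sanity-check the model chain without a webcam, you can run one static image through the pipeline (a sketch; `test.jpg` is a hypothetical file, and the function names come from `app.py` above):

```python
from PIL import Image
import app  # the module defined above; importing it does not launch the UI

# Load the models once, then run a single frame through the chain
vision = app.initialize_vision_model()
llm = app.initialize_llm()
tts = app.initialize_tts_model()

frame = Image.open("test.jpg")
caption, roast, (sr, audio) = app.process_frame(frame, vision, llm, tts)
print(caption)
print(roast)
```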
requirements.txt ADDED
@@ -0,0 +1,9 @@
+ gradio==4.31.2
+ torch==2.1.0
+ torchvision
+ transformers>=4.45.0
+ pillow
+ numpy
+ accelerate
+ git+https://github.com/huggingface/parler-tts.git
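
Two notes on the pins: `gradio` matches the `sdk_version` declared in the README front matter, and `transformers` is floored at 4.45 because earlier releases cannot load the Llama-3.2 architecture. `parler-tts` is installed from its GitHub repository (the install route its model card documents) and provides the `ParlerTTSForConditionalGeneration` class imported in `app.py`.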