jing-bi committed on
Commit
c564b63
·
0 Parent(s):

Fresh deploy: all latest files

Files changed (11)
  1. .gitignore +194 -0
  2. LICENSE +21 -0
  3. README.md +93 -0
  4. app/agent.py +27 -0
  5. app/config.py +40 -0
  6. app/memory.py +278 -0
  7. app/tool.py +184 -0
  8. app/utils.py +72 -0
  9. main.py +183 -0
  10. requirements.txt +7 -0
  11. styles.css +391 -0
.gitignore ADDED
@@ -0,0 +1,194 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ archive/
6
+ # C extensions
7
+ *.so
8
+ test*
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Abstra
171
+ # Abstra is an AI-powered process automation framework.
172
+ # Ignore directories containing user credentials, local state, and settings.
173
+ # Learn more at https://abstra.io/docs
174
+ .abstra/
175
+
176
+ # Visual Studio Code
177
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
178
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
179
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
180
+ # you could uncomment the following to ignore the entire vscode folder
181
+ # .vscode/
182
+
183
+ # Ruff stuff:
184
+ .ruff_cache/
185
+
186
+ # PyPI configuration file
187
+ .pypirc
188
+
189
+ # Cursor
190
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
191
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192
+ # refer to https://docs.cursor.com/context/ignore-files
193
+ .cursorignore
194
+ .cursorindexingignore
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jing Bi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,93 @@
1
+ ---
2
+ title: Perceptual Copilot
3
+ emoji: 👁️
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.33.1
8
+ app_file: main.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ ## ✨ What is Perceptual Copilot?
14
+
15
+ Perceptual Copilot is a prototype that integrates OpenAI agents with visual tools to process real-time video streams. This experimental platform showcases both the promise and the current limitations of equipping agents with vision capabilities to understand and interact with live visual data.
16
+
17
+
18
+ ### Architecture Overview
19
+
20
+
21
+
22
+ ```
23
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
24
+ │ Webcam │───▶│ Memory │◀──▶│ Gradio │
25
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
26
+
27
+
28
+ ┌─────────────────┐ ┌─────────────────┐
29
+ │ Agent │◀──▶│ Tools │
30
+ └─────────────────┘ └─────────────────┘
31
+ ```
32
+
33
+ ### Available Tools
34
+
35
+ | Tool | Description | Output |
36
+ |------|-------------|---------|
37
+ | `caption` | Generate detailed image descriptions | Rich visual descriptions |
38
+ | `ocr` | Extract text from images | Extracted text content |
39
+ | `localize` | Detect and locate objects | Bounding boxes with labels |
40
+ | `qa` | Answer questions about images | Contextual answers |
41
+ | `time` | Get current timestamp | Current date and time |
42
+ | _More tools coming soon..._ | Additional capabilities in development | Various outputs |
43
+
44
+ ## 🚀 Quick Start
45
+
46
+ ### Prerequisites
47
+
48
+ - Webcam access
49
+
50
+ ### Installation
51
+
52
+ 1. **Install dependencies**
53
+ ```bash
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+ 2. **Set up environment variables**
58
+ ```bash
59
+ export HF_TOKEN="your_huggingface_token"
60
+ export API_KEY="your_openai_api_key"
61
+ export END_LANG="your_llm_endpoint"
62
+ export END_TASK="your_task_endpoint"
63
+ export MODEL_AGENT="your_agent_model"
64
+ export MODEL_MLLM="your_multimodal_model"
65
+ export MODEL_LOC="your_localization_model"
66
+ ```
67
+
68
+ 3. **Launch the application**
69
+ ```bash
70
+ python main.py
71
+ ```
72
+
73
+ ## 💡 Usage Examples
74
+
75
+ ### Basic Interaction
76
+ - **User**: "What do you see?"
77
+ - **Assistant**: *Generates detailed caption of current view*
78
+
79
+ ### OCR Functionality
80
+ - **User**: "Read the text in this document"
81
+ - **Assistant**: *Extracts and returns all visible text*
82
+
83
+ ### Object Detection
84
+ - **User**: "What objects are in front of me?"
85
+ - **Assistant**: *Identifies and localizes objects with bounding boxes*
86
+
87
+
88
+ ## Acknowledgments
89
+
90
+ - Built with [Gradio](https://gradio.app/) for the interactive web interface
91
+ - Uses [Supervision](https://supervision.roboflow.com/) for frame annotation
92
+ - WebRTC integration via [FastRTC](https://github.com/gradio-app/gradio)
93
+
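The architecture diagram in the README maps onto the modules added in this commit roughly as follows. The snippet below is a minimal orientation sketch, not part of the commit: it assumes the environment variables from the Quick Start are set, and it skips the Gradio/WebRTC wiring that `main.py` provides.

```python
import time
import numpy as np
from app.agent import build_agent   # builds the tool-equipped OpenAI Agent
from app.memory import Memory       # per-session frame buffer + background agent loop

mem = Memory(build_agent())          # spawns the background asyncio loop in a daemon thread
time.sleep(0.5)                      # give the loop a moment to start before sending messages

frame = np.zeros((480, 640, 3), dtype=np.uint8)   # stand-in for a webcam frame (RGB)
mem.enqueue(frame)                   # main.py calls this for every WebRTC frame (rate-limited by FPS)

mem.receive("What do you see?")      # queues the question; Runner.run() executes the agent + tools
time.sleep(5)                        # the reply is appended asynchronously once the run finishes
print(mem.chat.messages)             # Gradio-ready chat history, including tool snapshots
```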
app/agent.py ADDED
@@ -0,0 +1,27 @@
1
+
2
+ from agents import Agent
3
+ from app.memory import Memory
4
+ from openai import AsyncOpenAI
5
+ from app.config import env
6
+ from agents import set_default_openai_client, set_default_openai_api, set_tracing_disabled
7
+ from app.tool import caption, ocr, localize, qa, time, video_caption, video_qa
8
+
9
+ def build_agent():
10
+ client = AsyncOpenAI(base_url=env.end_lang,api_key=env.api_key)
11
+ set_default_openai_client(client=client, use_for_tracing=False)
12
+ set_default_openai_api("chat_completions")
13
+ set_tracing_disabled(disabled=True)
14
+ chat_agent = Agent[Memory](
15
+ name="Assistant",
16
+ tools=[caption, ocr, qa, time, localize, video_caption, video_qa],
17
+ model=env.model_agent,
18
+ instructions=(
19
+ "As a helpful assistant, your functions include answering questions about images, "
20
+ "Optical Character Recognition (OCR), image caption generation, object localization "
21
+ "within images, and video caption generation and Q&A. For video-related tools, you "
22
+ "will need to determine the appropriate time window to analyze from the past."
23
+ ),
24
+ )
25
+
26
+ return chat_agent
27
+
app/config.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import logging
3
+ from openai import OpenAI
4
+
5
+
6
+
7
+
8
+ try:
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+ except ImportError:
12
+ pass
13
+
14
+
15
+ class Envs:
16
+ def __init__(self):
17
+ self.hf_token = os.getenv("HF_TOKEN")
18
+ self.api_key = os.getenv("API_KEY")
19
+ self.end_task = os.getenv("END_TASK")
20
+ self.end_lang = os.getenv("END_LANG")
21
+ self.model_agent = os.getenv("MODEL_AGENT")
22
+ self.model_mllm = os.getenv("MODEL_MLLM")
23
+ self.model_loc = os.getenv("MODEL_LOC")
24
+
25
+ # Only initialize OpenAI client if we have the required env vars
26
+ if self.end_lang and self.api_key:
27
+ self.client = OpenAI(base_url=self.end_lang, api_key=self.api_key)
28
+ else:
29
+ self.client = None
30
+ print("WARNING: OpenAI client not initialized due to missing environment variables")
31
+
32
+ self.debug = os.getenv("DEBUG", "1").lower() in ("true", "1", "yes")
33
+ self.fps = int(os.getenv("FPS", "2"))  # fall back to 2 fps if FPS is unset so the app can still start (fallback value is an assumption)
34
+
35
+
36
+ env = Envs()
37
+
38
+ logger = logging.getLogger('copilot')
39
+ logger.setLevel(logging.DEBUG if env.debug else logging.INFO)
40
+ logger.addHandler(logging.StreamHandler())
app/memory.py ADDED
@@ -0,0 +1,278 @@
1
+ import asyncio
2
+ from dataclasses import dataclass, field
3
+ from agents import Runner, RunHooks
4
+ import threading
5
+ from typing import Any, Dict, Optional, List
6
+ import traceback
7
+ import time
8
+ from datetime import datetime
9
+ import numpy as np
10
+ import gradio as gr
11
+
12
+ from .config import logger, env
13
+ @dataclass
14
+ class RunnerStep:
15
+ """Log entry for a single Runner step"""
16
+ timestamp: str
17
+ step_type: str
18
+ agent_name: str
19
+ turn_number: int
20
+ details: Dict[str, Any] = field(default_factory=dict)
21
+ duration_ms: Optional[float] = None
22
+
23
+ def __str__(self) -> str:
24
+ return f"[{self.timestamp}][T{self.turn_number}][{self.step_type}]: {self.details}"
25
+
26
+ @dataclass
27
+ class Message:
28
+ role: str
29
+ content: str
30
+ mode: str
31
+ metadata: Dict[str, Any] = field(default_factory=dict)
32
+
33
+ @classmethod
34
+ def user(cls, content: str) -> "Message":
35
+ return cls("user", content, '')
36
+
37
+ @classmethod
38
+ def system(cls, content: str) -> "Message":
39
+ return cls("system", content, '')
40
+
41
+ @classmethod
42
+ def tool(cls, content: str, **kwargs) -> "Message":
43
+ return cls("assistant", content, 'tool', kwargs)
44
+
45
+ @classmethod
46
+ def assistant(cls, content: str, mode='') -> "Message":
47
+ return cls("assistant", content, mode)
48
+
49
+ @classmethod
50
+ def tts(cls, content: str) -> "Message":
51
+ return cls("assistant", content, 'tts')
52
+
53
+ def to_dict(self) -> Dict[str, Any]:
54
+ result = {"role": self.role, "content": self.content}
55
+ if self.mode == "tool":
56
+ metadata = self.metadata.copy()
57
+ if title := metadata.get("title"):
58
+ metadata["title"] = title.title()
59
+ result["metadata"] = metadata
60
+ return result
61
+
62
+
63
+ @dataclass
64
+ class Snapshot:
65
+ sender: str
66
+ data: Any
67
+
68
+ @property
69
+ def gr(self):
70
+ if isinstance(self.data, np.ndarray):
71
+ return gr.Image(self.data)
72
+ return self.data
73
+
74
+
75
+ class RunnerLoggerHooks(RunHooks):
76
+ """Custom hooks to log every step of the Runner"""
77
+
78
+ def __init__(self, memory_instance):
79
+ super().__init__()
80
+ self.memory = memory_instance
81
+ self.current_turn = 0
82
+ self.turn_start_time = None
83
+
84
+ async def on_agent_start(self, context, agent):
85
+ self.current_turn += 1
86
+ self.turn_start_time = time.time()
87
+
88
+ step = RunnerStep(
89
+ timestamp=datetime.now().isoformat(),
90
+ step_type="turn_start",
91
+ agent_name=agent.name,
92
+ turn_number=self.current_turn,
93
+ details={"message": f"Starting turn {self.current_turn} with agent {agent.name}"}
94
+ )
95
+ self.memory.log_runner_step(step)
96
+
97
+ async def on_agent_end(self, context, agent, result):
98
+ if self.turn_start_time:
99
+ duration = (time.time() - self.turn_start_time) * 1000
100
+ else:
101
+ duration = None
102
+
103
+ step = RunnerStep(
104
+ timestamp=datetime.now().isoformat(),
105
+ step_type="agent_call",
106
+ agent_name=agent.name,
107
+ turn_number=self.current_turn,
108
+ details={"message": f"Agent {agent.name} completed", "result_type": type(result).__name__},
109
+ duration_ms=duration
110
+ )
111
+ self.memory.log_runner_step(step)
112
+
113
+ async def on_tool_start(self, context, agent, tool_call):
114
+ tool_name = getattr(tool_call, 'name', 'unknown')
115
+ tool_args = None
116
+ for attr in ['arguments', 'args', 'function', 'parameters']:
117
+ if hasattr(tool_call, attr):
118
+ tool_args = getattr(tool_call, attr)
119
+ break
120
+ step = RunnerStep(
121
+ timestamp=datetime.now().isoformat(),
122
+ step_type="tool_call",
123
+ agent_name=agent.name,
124
+ turn_number=self.current_turn,
125
+ details={
126
+ "tool_name": tool_name,
127
+ "tool_args": tool_args,
128
+ "message": f"Calling tool {tool_name}"
129
+ }
130
+ )
131
+ self.memory.log_runner_step(step)
132
+
133
+ async def on_tool_end(self, context, agent, tool_call, result):
134
+ # Handle different tool_call object attributes safely
135
+ tool_name = getattr(tool_call, 'name', 'unknown')
136
+
137
+ step = RunnerStep(
138
+ timestamp=datetime.now().isoformat(),
139
+ step_type="tool_result",
140
+ agent_name=agent.name,
141
+ turn_number=self.current_turn,
142
+ details={
143
+ "tool_name": tool_name,
144
+ "result_length": len(str(result)) if result else 0,
145
+ "message": f"Tool {tool_name} completed"
146
+ }
147
+ )
148
+ self.memory.log_runner_step(step)
149
+
150
+
151
+ class Chat:
152
+ def __init__(self):
153
+ self.history = []
154
+
155
+ def append(self, message: Message):
156
+ self.history.append(message)
157
+
158
+ @property
159
+ def messages(self):
160
+ return [i.to_dict() for i in self.history]
161
+
162
+
163
+ class Memory:
164
+ def __init__(self, agent, limit: int = 200) -> None:
165
+ self.limit: int = limit
166
+ self.frames: list[Any] = []
167
+ self.snapshots: list[Any] = []
168
+ self.inputs: list[Any] = []
169
+ self.chat = Chat()
170
+
171
+ self.runner_steps: List[RunnerStep] = []
172
+ self.step_limit: int = 1000 # Keep last 1000 steps
173
+ self.logger_hooks: Optional[RunnerLoggerHooks] = None
174
+
175
+ self._chat_q: asyncio.Queue[Any] = asyncio.Queue()
176
+ self._input_q: asyncio.Queue[Any] = asyncio.Queue()
177
+ self._loop: Optional[asyncio.AbstractEventLoop] = None
178
+ self.is_waiting: bool = False
179
+ self.is_running: bool = False
180
+ self._last_frame_time: float = 0
181
+ self.setup(agent)
182
+
183
+ def log_runner_step(self, step: RunnerStep) -> None:
184
+ """Log a runner step and maintain the step history limit"""
185
+ self.runner_steps.append(step)
186
+ logger.debug(f"[ 🛠️ ]{step}")
187
+ while len(self.runner_steps) > self.step_limit:
188
+ self.runner_steps.pop(0)
189
+
190
+ def enqueue(self, data: Any) -> Optional[Any]:
191
+ current_time = time.time()
192
+ if current_time-self._last_frame_time > 1.0 / env.fps:
193
+ self._last_frame_time = current_time
194
+ self.frames.append(data)
195
+ while len(self.frames) > self.limit:
196
+ self.frames.pop(0)
197
+ return self.snapshots.pop(0) if self.snapshots else None
198
+
199
+ def receive(self, text: str) -> None:
200
+ self.chat.append(Message.user(text))
201
+ self._loop.call_soon_threadsafe(self._chat_q.put_nowait, text)
202
+
203
+
204
+ def setup(self, agent) -> None:
205
+ """Bind *agent* and spawn the background monitor threads."""
206
+ self.v_agent = agent
207
+ self.logger_hooks = RunnerLoggerHooks(self)
208
+ def _runner() -> None:
209
+ self._loop = asyncio.new_event_loop()
210
+ asyncio.set_event_loop(self._loop)
211
+ try:
212
+ self._loop.create_task(self._monitor_chat())
213
+ self._loop.run_forever()
214
+ finally:
215
+ self._loop.close()
216
+
217
+ threading.Thread(target=_runner, daemon=True).start()
218
+
219
+ async def _monitor_chat(self) -> None:
220
+ """Process incoming chat messages, respecting the waiting gate."""
221
+ while True:
222
+ text = await self._chat_q.get()
223
+ logger.debug(f"Processing: {text}")
224
+ start_step = RunnerStep(
225
+ timestamp=datetime.now().isoformat(),
226
+ step_type="processing_start",
227
+ agent_name=getattr(self.v_agent, 'name', 'unknown'),
228
+ turn_number=0,
229
+ details={"user_input": text}
230
+ )
231
+ self.log_runner_step(start_step)
232
+
233
+ try:
234
+ self.is_running = True
235
+ result = await Runner.run(
236
+ starting_agent=self.v_agent,
237
+ input=text,
238
+ context=self,
239
+ hooks=self.logger_hooks # Add our custom hooks here
240
+ )
241
+
242
+ self.is_running = False
243
+
244
+ # Log successful completion
245
+ success_step = RunnerStep(
246
+ timestamp=datetime.now().isoformat(),
247
+ step_type="final_output",
248
+ agent_name=getattr(self.v_agent, 'name', 'unknown'),
249
+ turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
250
+ details={
251
+ "output_type": type(result.final_output).__name__,
252
+ "output_preview": str(result.final_output)[:100] + "..." if len(str(result.final_output)) > 100 else str(result.final_output)
253
+ }
254
+ )
255
+ self.log_runner_step(success_step)
256
+
257
+ except Exception as exc: # noqa: BLE001
258
+ self.is_running = False
259
+ full_traceback = traceback.format_exc()
260
+ logger.debug(f"Error in _monitor_chat: {exc}\n{full_traceback}")
261
+
262
+ # Log the error
263
+ error_step = RunnerStep(
264
+ timestamp=datetime.now().isoformat(),
265
+ step_type="error",
266
+ agent_name=getattr(self.v_agent, 'name', 'unknown'),
267
+ turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
268
+ details={
269
+ "error_type": type(exc).__name__,
270
+ "error_message": str(exc),
271
+ "traceback": full_traceback
272
+ }
273
+ )
274
+ self.log_runner_step(error_step)
275
+ continue
276
+ final = result.final_output.split('</think>', 1)[-1]
277
+ self.chat.append(Message.assistant(final))
278
+ await asyncio.sleep(0)
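Every `Runner` turn, tool call, and error is appended to `Memory.runner_steps` (capped at `step_limit`). A small illustrative helper for debugging, not included in this commit, could filter that log like so:

```python
def dump_tool_activity(mem: "Memory") -> None:
    """Print the tool-related RunnerStep entries, oldest first (illustrative helper)."""
    for step in mem.runner_steps:
        if step.step_type in ("tool_call", "tool_result", "error"):
            # RunnerStep.__str__ renders "[timestamp][T<turn>][<step_type>]: {details}"
            print(step)
```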
app/tool.py ADDED
@@ -0,0 +1,184 @@
1
+ import datetime
2
+ import json
3
+ import cv2
4
+ import httpx
5
+ from app.config import env
6
+ from app.utils import image_w_box, encode_image
7
+ from agents import RunContextWrapper, function_tool
8
+ from app.memory import Memory,Snapshot
9
+
10
+
11
+
12
+
13
+ def task(name, image):
14
+ resp = httpx.post(f"{env.end_task}",
15
+ data={"name": name},
16
+ files={"file": ("frame.jpg", image.tobytes(), "image/jpeg")},
17
+ timeout=10,
18
+ headers={"Authorization": env.api_key},
19
+ )
20
+ resp.raise_for_status()
21
+ return resp.json()['result']
22
+
23
+ def completion(messages, model):
24
+ response = env.client.chat.completions.create(
25
+ model=model,
26
+ messages=messages
27
+ )
28
+ return response.choices[0].message.content
29
+
30
+
31
+ def completion_image(images, prompt, model):
32
+ messages = [
33
+ {
34
+ "role": "user",
35
+ "content": [
36
+ {"type": "text", "text": prompt},
37
+ {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
38
+ ],
39
+ }
40
+ for b64, mime in map(encode_image, images)
41
+ ]
42
+ return completion(messages, model=model)
43
+
44
+ # ------------------------ Function Tools ------------------------
45
+ @function_tool
46
+ def caption(wrapper: RunContextWrapper[Memory]) -> str:
47
+ """
48
+ Generate a descriptive caption for the most recent frame, record it as a snapshot, and return it.
49
+ Returns:
50
+ str:
51
+ The generated caption for the current view (i.e., the latest frame).
52
+ """
53
+ mem = wrapper.context
54
+ prompt = "Describe the image with rich details but in a concise manner."
55
+ result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
56
+ mem.snapshots.append(Snapshot(sender='caption', data=result))
57
+ return result
58
+
59
+ @function_tool
60
+ def ocr(wrapper: RunContextWrapper[Memory]) -> str:
61
+ """
62
+ Perform OCR on the most recent frame, record it as a snapshot, and return the extracted text.
63
+ Returns:
64
+ str:
65
+ The extracted text from the current view (i.e., the latest frame).
66
+ """
67
+ mem = wrapper.context
68
+ prompt = "Extract all text from image/payslip without miss anything."
69
+ result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
70
+ mem.snapshots.append(Snapshot(sender='ocr', data=result))
71
+ return result
72
+
73
+ @function_tool
74
+ def qa(wrapper: RunContextWrapper[Memory], question: str) -> str:
75
+ """
76
+ Answer a question based on the most recent frame, record it as a snapshot, and return the answer.
77
+
78
+ Args:
79
+ question (str): The question to be answered.
80
+ Returns:
81
+ str:
82
+ The answer to the question based on the current view (i.e., the latest frame).
83
+ """
84
+ mem = wrapper.context
85
+ prompt = f"Answer the question based on the image. Question: {question}"
86
+ result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
87
+ mem.snapshots.append(Snapshot(sender='qa', data=result))
88
+ return result
89
+
90
+
91
+ @function_tool
92
+ def localize(wrapper: RunContextWrapper[Memory]) -> str:
93
+ """
94
+ Localize all objects in the most recent frame
95
+ Returns:
96
+ str:
97
+ The localization result for the current view (i.e., the latest frame).
98
+ the format is {name:list of bboxes}
99
+ """
100
+ mem = wrapper.context
101
+ frame = mem.frames[-1]
102
+ _, img = cv2.imencode('.jpg', frame)
103
+ objxbox = task(env.model_loc, img)
104
+ mem.snapshots.append(Snapshot(sender='localize', data=image_w_box(frame, objxbox)))
105
+ return json.dumps(objxbox, indent=2)
106
+
107
+
108
+ @function_tool
109
+ def time(wrapper: RunContextWrapper[Memory]) -> str:
110
+ """
111
+ Get the current time, record it as a snapshot, and return the time.
112
+ Returns:
113
+ str:
114
+ The current time.
115
+ """
116
+ mem = wrapper.context
117
+ result = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
118
+ mem.snapshots.append(Snapshot(sender='time', data=result))
119
+ return result
120
+
121
+ def sample_frames(mem: Memory, n: int) -> list:
122
+ """
123
+ Sample frames from the past n seconds of video.
124
+
125
+ Args:
126
+ mem (Memory): The memory context containing frames.
127
+ n (int): Number of seconds to look back for video frames.
128
+ Returns:
129
+ list: Sampled frames from the video sequence.
130
+ """
131
+ if len(mem.frames) == 0:
132
+ return []
133
+
134
+ available_frames = min(n * env.fps, len(mem.frames))
135
+ recent_frames = mem.frames[-available_frames:]
136
+ sampled_frames = recent_frames[::env.fps // 2]
137
+
138
+ return sampled_frames
139
+
140
+ @function_tool
141
+ def video_caption(wrapper: RunContextWrapper[Memory], n=2) -> str:
142
+ """
143
+ Generate a descriptive caption for a video sequence from the past n seconds of frames.
144
+ The n is a required parameter that specifies how many seconds of video frames to consider.
145
+
146
+ Args:
147
+ n (int): Number of seconds to look back for video frames.
148
+ Returns:
149
+ str:
150
+ The generated caption for the video sequence from the past n seconds.
151
+ """
152
+ mem = wrapper.context
153
+ sampled_frames = sample_frames(mem, n)
154
+
155
+ if len(sampled_frames) == 0:
156
+ return "No frames available for video caption."
157
+
158
+ prompt = "Describe this video sequence focusing on any changes or actions that occur over time."
159
+ result = completion_image(sampled_frames, prompt, env.model_mllm)
160
+ mem.snapshots.append(Snapshot(sender='video caption', data=result))
161
+ return result
162
+
163
+ @function_tool
164
+ def video_qa(wrapper: RunContextWrapper[Memory], question: str, n=2) -> str:
165
+ """
166
+ Answer a question based on a video sequence from the past n seconds of frames.
167
+
168
+ Args:
169
+ question (str): The question to be answered.
170
+ n (int): Number of seconds to look back for video frames.
171
+ Returns:
172
+ str:
173
+ The answer to the question based on the video sequence from the past n seconds.
174
+ """
175
+ mem = wrapper.context
176
+ sampled_frames = sample_frames(mem, n)
177
+
178
+ if len(sampled_frames) == 0:
179
+ return "No frames available for video Q&A."
180
+
181
+ prompt = f"Answer the question based on this video sequence. Question: {question}"
182
+ result = completion_image(sampled_frames, prompt, env.model_mllm)
183
+ mem.snapshots.append(Snapshot(sender='video qa', data=result))
184
+ return result
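All tools above follow the same pattern: a `@function_tool` that reads recent frames from the `Memory` context, calls a model or local routine, and records a `Snapshot` so the result shows up in the Gradio chat. A hypothetical new tool (not part of this commit) would look like the sketch below; the blur metric is just an example.

```python
import cv2
from agents import RunContextWrapper, function_tool
from app.memory import Memory, Snapshot

@function_tool
def sharpness(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Estimate how sharp (in focus) the most recent frame is, record it as a snapshot, and return it.
    Returns:
        str: A sharpness score for the current view; higher means sharper.
    """
    mem = wrapper.context
    gray = cv2.cvtColor(mem.frames[-1], cv2.COLOR_RGB2GRAY)
    score = cv2.Laplacian(gray, cv2.CV_64F).var()   # variance of the Laplacian as a blur measure
    result = f"Sharpness score: {score:.1f} (higher is sharper)"
    mem.snapshots.append(Snapshot(sender='sharpness', data=result))
    return result
```

Registering such a tool would be a one-line change: add it to the `tools=[...]` list in `build_agent()` (`app/agent.py`).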
app/utils.py ADDED
@@ -0,0 +1,72 @@
1
+ from PIL import Image
2
+ import base64
3
+ import supervision as sv
4
+ import numpy as np
5
+ import cv2
6
+ colors = sv.ColorPalette.from_hex(
7
+ [
8
+ "#a1c9f4",
9
+ "#ffb482",
10
+ "#8de5a1",
11
+ "#ff9f9b",
12
+ "#d0bbff",
13
+ "#debb9b",
14
+ "#fab0e4",
15
+ "#cfcfcf",
16
+ "#fffea3",
17
+ "#b9f2f0",
18
+ "#a1c9f4",
19
+ "#ffb482",
20
+ "#8de5a1",
21
+ "#ff9f9b",
22
+ "#d0bbff",
23
+ "#debb9b",
24
+ "#fab0e4",
25
+ "#cfcfcf",
26
+ "#fffea3",
27
+ "#b9f2f0",
28
+ ]
29
+ )
30
+
31
+ def image_w_box(image,objxbox):
32
+
33
+ box_annotator = sv.BoxCornerAnnotator(thickness=10, corner_length=30, color=colors)
34
+ label_annotator = sv.LabelAnnotator(color=colors)
35
+ mask_annotator = sv.MaskAnnotator(opacity=0.2, color=colors)
36
+
37
+ xyxys = np.array([v for boxes in objxbox.values() for v in boxes])
38
+ unique_labels = sorted(objxbox.keys())
39
+ class_id_map = dict(enumerate(unique_labels))
40
+ labels = [l for l, boxes in objxbox.items() for _ in boxes]
41
+ class_id = [list(class_id_map.values()).index(label) for label in labels]
42
+
43
+ masks = np.zeros((len(xyxys), image.shape[0], image.shape[1]), dtype=bool)
44
+ for i, (x1, y1, x2, y2) in enumerate(xyxys):
45
+ masks[i, int(y1):int(y2), int(x1):int(x2)] = True  # mark the box region in the boolean mask
46
+
47
+ if len(xyxys) == 0:
48
+ return image
49
+ detections = sv.Detections(
50
+ xyxy=xyxys,
51
+ mask=masks,
52
+ class_id=np.array(class_id),
53
+ )
54
+ # Convert RGB to BGR for annotation
55
+ image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
56
+ # After annotation, convert back to RGB
57
+ annotated_image = box_annotator.annotate(scene=image_bgr.copy(), detections=detections)
58
+ annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
59
+ annotated_image = mask_annotator.annotate(scene=annotated_image, detections=detections)
60
+
61
+ return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
62
+
63
+
64
+ def encode_image(img) -> tuple[str, str]:
65
+ arr = np.array(img.convert("RGB")) if isinstance(img, Image.Image) else img
66
+ if not isinstance(arr, np.ndarray):
67
+ raise ValueError("Unsupported image type")
68
+ ok, buf = cv2.imencode('.jpg', cv2.cvtColor(arr, cv2.COLOR_RGB2BGR))
69
+ if not ok:
70
+ raise ValueError("Encoding failed")
71
+ b64 = base64.b64encode(buf).decode('utf-8')
72
+ return b64, "image/jpeg"
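For reference, a quick sketch of how these helpers are used elsewhere in the commit: `localize` in `app/tool.py` passes the detector's `{label: [boxes]}` output to `image_w_box`, and `completion_image` base64-encodes frames with `encode_image`. The dummy frame and boxes here are illustrative.

```python
import numpy as np
from app.utils import image_w_box, encode_image

frame = np.zeros((480, 640, 3), dtype=np.uint8)                           # dummy RGB frame
objxbox = {"cup": [[100, 120, 220, 260]], "book": [[300, 80, 560, 400]]}  # {label: [x1, y1, x2, y2]}

annotated = image_w_box(frame, objxbox)    # RGB frame with corner boxes, labels, and translucent masks
b64, mime = encode_image(annotated)        # base64-encoded JPEG + MIME type for the multimodal endpoint
```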
main.py ADDED
@@ -0,0 +1,183 @@
1
+ from pathlib import Path
2
+ import os
3
+ import cv2
4
+ import gradio as gr
5
+ from fastrtc import Stream,WebRTC
6
+ from app.config import env
7
+ from fastrtc import AdditionalOutputs
8
+ from app.memory import Memory,Message
9
+ from fastrtc import get_cloudflare_turn_credentials
10
+ from app.agent import build_agent
11
+ from fastrtc import get_current_context
12
+ session_memories = {}
13
+
14
+ def get_session_memory(session_id: str = None) -> Memory:
15
+ if session_id not in session_memories:
16
+ session_memories[session_id] = Memory(build_agent())
17
+ welcome_message = "👋 Now I can see. Feel free to ask me about anything!"
18
+ session_memories[session_id].chat.append(Message.assistant(welcome_message))
19
+ return session_memories[session_id]
20
+
21
+ def video_handler(frame):
22
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
23
+ rtcid = get_current_context().webrtc_id
24
+ mem = get_session_memory(rtcid)
25
+ if (snapshot := mem.enqueue(frame)):
26
+ mem.chat.append(Message.tool(snapshot.gr, title=snapshot.sender, status='done'))
27
+ return frame, AdditionalOutputs(mem.chat.messages, rtcid)
28
+
29
+ def chat_handler(text, webrtc_state):
30
+ if webrtc_state is None:
31
+ return "", [{"role": "assistant", "content": "Please start your camera first to begin the conversation."}], webrtc_state
32
+
33
+ mem = get_session_memory(webrtc_state)
34
+ if not mem.is_running:
35
+ mem.receive(text.strip())
36
+ return "", mem.chat.messages, webrtc_state
37
+
38
+
39
+
40
+
41
+
42
+ if __name__ == "__main__":
43
+ print("🚀 Starting Perceptual Copilot...")
44
+ print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
45
+ print(f"Environment check - API_KEY: {'✓' if env.api_key else '✗'}")
46
+ print(f"Environment check - END_LANG: {'✓' if env.end_lang else '✗'}")
47
+ print(f"Environment check - OpenAI Client: {'✓' if env.client else '✗'}")
48
+
49
+
50
+
51
+ with gr.Blocks(
52
+ title="🤖 Perceptual Copilot - AI Vision Assistant",
53
+ theme=gr.themes.Soft(
54
+ primary_hue="blue",
55
+ secondary_hue="orange",
56
+ neutral_hue="slate",
57
+ font=("system-ui", "sans-serif")
58
+ ),
59
+ css=Path("styles.css").read_text(),
60
+ ) as demo:
61
+
62
+ # Header section with sleek styling
63
+ gr.Markdown("""
64
+ <div class="ultra-sleek-header">
65
+ <h1 class="hero-title">
66
+ <span class="title-primary">Perceptual</span>
67
+ <span class="title-accent">Copilot</span>
68
+ </h1>
69
+ <p class="hero-subtitle">
70
+ <span class="status-dot"></span>
71
+ An experimental prototype that integrates OpenAI agents with visual tools to process real-time video streams.
72
+ </p>
73
+ <div class="feature-pills">
74
+ <span class="pill">Real-time streaming</span>
75
+ <span class="pill">Visual Agent</span>
76
+ <span class="pill">Large vision language model</span>
77
+ <span class="pill">Reasoning</span>
78
+ </div>
79
+ </div>
80
+ """, elem_classes="ultra-sleek-header")
81
+
82
+ state = gr.State(value=None)
83
+
84
+ # Main interface with improved layout
85
+ with gr.Row(equal_height=True):
86
+ with gr.Column(scale=1, elem_classes="video-container"):
87
+ video = WebRTC(
88
+ label="🎥 Camera Stream",
89
+ rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
90
+ track_constraints={
91
+ "width": {"exact": 600},
92
+ "height": {"exact": 600},
93
+ "aspectRatio": {"exact": 1}},
94
+ mode="send",
95
+ modality="video",
96
+ mirror_webcam=True,
97
+ width=600,
98
+ height=600,
99
+ )
100
+
101
+ with gr.Column(scale=1, elem_classes="chat-container"):
102
+ gr.Markdown("### 💬 Chat")
103
+ chatbot = gr.Chatbot(
104
+ type="messages",
105
+ height=450,
106
+ label="🤖 AI Assistant",
107
+ placeholder="Chat history will appear here...",
108
+ show_label=False,
109
+ )
110
+
111
+ with gr.Row(elem_classes="items-center"):
112
+ textbox = gr.Textbox(
113
+ placeholder="💭 Question goes here, press ENTER to send",
114
+ lines=1,
115
+ show_label=False,
116
+ )
117
+ # Event handlers
118
+ video.stream(
119
+ fn=video_handler,
120
+ inputs=[video],
121
+ outputs=[video],
122
+ concurrency_limit=10,
123
+ )
124
+ video.on_additional_outputs(
125
+ fn=lambda messages, webrtc_id: (messages, webrtc_id),
126
+ outputs=[chatbot, state]
127
+ )
128
+
129
+ # Chat handler for textbox
130
+ textbox.submit(
131
+ chat_handler,
132
+ inputs=[textbox, state],
133
+ outputs=[textbox, chatbot, state]
134
+ )
135
+
136
+ # Enhanced instructions section
137
+ with gr.Column(elem_classes="instructions-container"):
138
+ gr.Markdown("""
139
+ ## 🚀 Get Started
140
+
141
+ **📌 Quick Reminder:**
142
+ 1. Allow camera access when prompted
143
+ 2. Wait for the camera to initialize and first message to appear
144
+ 3. 💡 **Tip:** If you find it hard to see the interface, please turn off night mode for better visibility
145
+ """)
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ gr.Markdown("""
150
+ ### 💡 Example Prompts
151
+
152
+ **🌍 General Vision:**
153
+ - *"What do you see in front of me?"*
154
+ - *"What's the overall environment like?"*
155
+
156
+ **📄 Text & Documents:**
157
+ - *"Read the text in this document"*
158
+ - *"Extract the code snippet from this image"*
159
+
160
+ **🔍 Object Recognition:**
161
+ - *"What objects are visible?"*
162
+ - *"Help me identify this item"*
163
+ """)
164
+
165
+ with gr.Column():
166
+ gr.Markdown("""
167
+ ### 🔧 Current Capabilities
168
+
169
+ **🚀 Available Features:**
170
+ - **OCR** - Text extraction and reading
171
+ - **Q&A** - Visual question answering
172
+ - **Caption** - Scene description and analysis
173
+ - **Localization** - Object detection and positioning
174
+ - **Time** - Current time and temporal context
175
+
176
+ **📈 More Coming Soon:**
177
+ We're continuously adding new capabilities to enhance your visual AI experience.
178
+
179
+ **⚠️ Important Note:**
180
+ All models are self-hosted. Please avoid abuse of the system.
181
+ """)
182
+ demo.queue(default_concurrency_limit=None)
183
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ huggingface_hub
2
+ python-dotenv
3
+ supervision
4
+ openai-agents
5
+ fastrtc==0.0.21
6
+ gradio
7
+ pydantic==2.10.6
styles.css ADDED
@@ -0,0 +1,391 @@
1
+ .gradio-container {
2
+ background: linear-gradient(135deg, #fefefe 0%, #f8f6f0 100%) !important;
3
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
4
+ }
5
+
6
+ .main-header {
7
+ background: rgba(255, 255, 255, 0.95);
8
+ border-radius: 15px;
9
+ padding: 20px;
10
+ margin: 10px 0;
11
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
12
+ backdrop-filter: blur(10px);
13
+ border: 1px solid rgba(255, 255, 255, 0.2);
14
+ }
15
+
16
+ .video-container {
17
+ background: rgba(255, 255, 255, 0.9);
18
+ border-radius: 12px;
19
+ padding: 20px;
20
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
21
+ backdrop-filter: blur(8px);
22
+ border: 1px solid rgba(255, 255, 255, 0.3);
23
+ }
24
+
25
+ .chat-container {
26
+ background: rgba(255, 255, 255, 0.9);
27
+ border-radius: 12px;
28
+ padding: 20px;
29
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
30
+ backdrop-filter: blur(8px);
31
+ border: 1px solid rgba(255, 255, 255, 0.3);
32
+ }
33
+
34
+ .chat-container textarea {
35
+ border: 3px solid #4285f4 !important;
36
+ border-radius: 12px !important;
37
+ padding: 15px !important;
38
+ font-size: 16px !important;
39
+ font-weight: 500 !important;
40
+ background: #ffffff !important;
41
+ box-shadow: 0 4px 20px rgba(66, 133, 244, 0.15) !important;
42
+ transition: all 0.3s ease !important;
43
+ min-height: 50px !important;
44
+ display: flex !important;
45
+ align-items: center !important;
46
+ justify-content: center !important;
47
+ resize: none !important;
48
+ }
49
+
50
+ .chat-container textarea:focus {
51
+ border-color: #1a73e8 !important;
52
+ box-shadow: 0 6px 25px rgba(66, 133, 244, 0.3), 0 0 0 3px rgba(66, 133, 244, 0.1) !important;
53
+ outline: none !important;
54
+ transform: translateY(-2px) !important;
55
+ }
56
+
57
+ .chat-container textarea::placeholder {
58
+ color: #5f6368 !important;
59
+ font-weight: 400 !important;
60
+ font-size: 15px !important;
61
+ }
62
+
63
+ .chat-container .gr-text-input {
64
+ border: none !important;
65
+ background: transparent !important;
66
+ }
67
+
68
+ .instructions-container {
69
+ background: rgba(255, 255, 255, 0.9);
70
+ border-radius: 12px;
71
+ padding: 25px;
72
+ margin: 20px 0;
73
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
74
+ backdrop-filter: blur(8px);
75
+ border: 1px solid rgba(255, 255, 255, 0.3);
76
+ }
77
+
78
+ .feature-card {
79
+ background: linear-gradient(135deg, #ff6b6b, #feca57);
80
+ border-radius: 10px;
81
+ padding: 15px;
82
+ margin: 10px 0;
83
+ color: white;
84
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
85
+ }
86
+
87
+ .status-indicator {
88
+ display: inline-block;
89
+ width: 10px;
90
+ height: 10px;
91
+ background: #4CAF50;
92
+ border-radius: 50%;
93
+ margin-right: 8px;
94
+ animation: pulse 2s infinite;
95
+ }
96
+
97
+ @keyframes pulse {
98
+ 0% { opacity: 1; }
99
+ 50% { opacity: 0.5; }
100
+ 100% { opacity: 1; }
101
+ }
102
+
103
+ .title-emoji {
104
+ font-size: 2em;
105
+ margin-right: 10px;
106
+ vertical-align: middle;
107
+ }
108
+
109
+ /* Hide Gradio branding and footer elements */
110
+ .gradio-container footer {
111
+ display: none !important;
112
+ }
113
+
114
+ /* Hide "Use via API" button */
115
+ .gradio-container .api-docs {
116
+ display: none !important;
117
+ }
118
+
119
+ /* Hide "Built with Gradio" logo */
120
+ .gradio-container .built-with {
121
+ display: none !important;
122
+ }
123
+
124
+ /* Hide Settings button */
125
+ .gradio-container .settings {
126
+ display: none !important;
127
+ }
128
+
129
+ /* Hide the entire footer area */
130
+ .gradio-container .footer {
131
+ display: none !important;
132
+ }
133
+
134
+ /* Alternative selectors for different Gradio versions */
135
+ .gradio-container div[data-testid="footer"] {
136
+ display: none !important;
137
+ }
138
+
139
+ .gradio-container .gradio-footer {
140
+ display: none !important;
141
+ }
142
+
143
+ /* Hide any elements containing gradio branding text */
144
+ .gradio-container a[href*="gradio"] {
145
+ display: none !important;
146
+ }
147
+
148
+ .gradio-container div:has-text("Built with") {
149
+ display: none !important;
150
+ }
151
+
152
+ .gradio-container div:has-text("Use via API") {
153
+ display: none !important;
154
+ }
155
+
156
+ /* Ultra-sleek header styling with modern design */
157
+ .ultra-sleek-header {
158
+ text-align: center;
159
+ padding: 20px 20px 5px 20px;
160
+ margin: 0;
161
+ background: transparent;
162
+ border: none;
163
+ position: relative;
164
+ overflow: hidden;
165
+ }
166
+
167
+ .ultra-sleek-header::before {
168
+ content: '';
169
+ position: absolute;
170
+ top: 0;
171
+ left: 0;
172
+ right: 0;
173
+ bottom: 0;
174
+ background: radial-gradient(ellipse at center top, rgba(99, 102, 241, 0.05) 0%, transparent 70%);
175
+ pointer-events: none;
176
+ }
177
+
178
+
179
+ .badge-icon {
180
+ font-size: 1rem;
181
+ animation: float 3s ease-in-out infinite;
182
+ }
183
+
184
+ @keyframes float {
185
+
186
+ 0%,
187
+ 100% {
188
+ transform: translateY(0px);
189
+ }
190
+
191
+ 50% {
192
+ transform: translateY(-4px);
193
+ }
194
+ }
195
+
196
+ .hero-title {
197
+ font-size: 4rem;
198
+ font-weight: 800;
199
+ margin: 10px 0 5px 0;
200
+ line-height: 1.1;
201
+ letter-spacing: -0.03em;
202
+ position: relative;
203
+ z-index: 1;
204
+ }
205
+
206
+ .title-primary {
207
+ background: linear-gradient(45deg,
208
+ #4169E1 0%,
209
+ #8A2BE2 50%,
210
+ #E91E63 100%);
211
+ -webkit-background-clip: text;
212
+ -webkit-text-fill-color: transparent;
213
+ background-clip: text;
214
+ position: relative;
215
+ }
216
+ .title-primary::after {
217
+ content: '';
218
+ position: absolute;
219
+ bottom: -8px;
220
+ left: 0;
221
+ right: 0;
222
+ height: 4px;
223
+ background: linear-gradient(90deg, #6366f1, #8b5cf6, #d946ef);
224
+ border-radius: 2px;
225
+ opacity: 0.6;
226
+ }
227
+
228
+ .title-accent {
229
+ background: linear-gradient(135deg, #1e293b 0%, #0d1e35 100%);
230
+ -webkit-background-clip: text;
231
+ -webkit-text-fill-color: transparent;
232
+ background-clip: text;
233
+ position: relative;
234
+ margin-left: 12px;
235
+ }
236
+
237
+
238
+
239
+ .hero-subtitle {
240
+ font-size: 1.25rem;
241
+ color: #64748b;
242
+ font-weight: 400;
243
+ margin: 15px auto 20px auto;
244
+ line-height: 1.6;
245
+ max-width: 580px;
246
+ display: flex;
247
+ align-items: center;
248
+ justify-content: center;
249
+ gap: 12px;
250
+ }
251
+
252
+ .status-dot {
253
+ display: inline-block;
254
+ width: 8px;
255
+ height: 8px;
256
+ background: #10b981;
257
+ border-radius: 50%;
258
+ position: relative;
259
+ flex-shrink: 0;
260
+ }
261
+
262
+ .status-dot::before {
263
+ content: '';
264
+ position: absolute;
265
+ top: 0;
266
+ left: 0;
267
+ width: 8px;
268
+ height: 8px;
269
+ background: #10b981;
270
+ border-radius: 50%;
271
+ animation: ping 2s cubic-bezier(0, 0, 0.2, 1) infinite;
272
+ transform-origin: center;
273
+ }
274
+
275
+ @keyframes ping {
276
+
277
+ 75%,
278
+ 100% {
279
+ transform: scale(2);
280
+ opacity: 0;
281
+ }
282
+ }
283
+
284
+ .feature-pills {
285
+ display: flex;
286
+ justify-content: center;
287
+ gap: 12px;
288
+ margin-top: 20px;
289
+ flex-wrap: wrap;
290
+ }
291
+
292
+ .pill {
293
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(248, 250, 252, 0.9));
294
+ border: 1px solid rgba(148, 163, 184, 0.15);
295
+ border-radius: 25px;
296
+ padding: 6px 14px;
297
+ font-size: 0.8rem;
298
+ font-weight: 600;
299
+ color: #475569;
300
+ backdrop-filter: blur(12px);
301
+ transition: all 0.25s cubic-bezier(0.4, 0, 0.2, 1);
302
+ white-space: nowrap;
303
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
304
+ letter-spacing: 0.025em;
305
+ }
306
+
307
+ .pill:hover {
308
+ background: linear-gradient(135deg, rgba(99, 102, 241, 0.1), rgba(139, 92, 246, 0.05));
309
+ border-color: rgba(99, 102, 241, 0.25);
310
+ color: #6366f1;
311
+ transform: translateY(-1px) scale(1.02);
312
+ box-shadow: 0 4px 12px rgba(99, 102, 241, 0.15), 0 2px 4px rgba(0, 0, 0, 0.05);
313
+ }
314
+
315
+ /* Ensure proper vertical alignment for chat input row */
316
+ .items-center {
317
+ display: flex !important;
318
+ align-items: center !important;
319
+ gap: 8px !important;
320
+ }
321
+
322
+ /* Responsive design for mobile */
323
+ @media (max-width: 768px) {
324
+ .ultra-sleek-header {
325
+ padding: 15px 15px 20px 15px;
326
+ }
327
+
328
+ .hero-title {
329
+ font-size: 3rem;
330
+ }
331
+
332
+ .hero-subtitle {
333
+ font-size: 1.125rem;
334
+ padding: 0 15px;
335
+ flex-direction: column;
336
+ gap: 8px;
337
+ }
338
+
339
+ .feature-pills {
340
+ gap: 8px;
341
+ margin-top: 24px;
342
+ }
343
+
344
+ .pill {
345
+ font-size: 0.8rem;
346
+ padding: 6px 12px;
347
+ }
348
+ }
349
+
350
+ @media (max-width: 480px) {
351
+ .hero-title {
352
+ font-size: 2.5rem;
353
+ line-height: 1.2;
354
+ }
355
+
356
+ .title-accent {
357
+ margin-left: 8px;
358
+ }
359
+ }
360
+ .gradio-container .chatbot .message-wrap button[aria-label*="clear" i],
361
+ .gradio-container .chatbot .message-wrap button[title*="clear" i] {
362
+ display: none !important;
363
+ }
364
+
365
+ /* Target any button with clear-related text content */
366
+ button:contains("Clear"),
367
+ button:contains("clear"),
368
+ button:contains("CLEAR") {
369
+ display: none !important;
370
+ }
371
+
372
+ /* Target buttons in chat interfaces specifically */
373
+ .chat-container button[aria-label*="clear" i],
374
+ .chatbot button[aria-label*="clear" i],
375
+ .message-wrap button[aria-label*="clear" i] {
376
+ display: none !important;
377
+ }
378
+
379
+ /* Hide buttons with specific SVG icons that might represent clear/delete */
380
+ button svg[data-testid*="clear"],
381
+ button svg[data-testid*="delete"],
382
+ button svg[data-testid*="trash"] {
383
+ display: none !important;
384
+ }
385
+
386
+ /* Hide parent button if it contains clear-related SVG */
387
+ button:has(svg[data-testid*="clear"]),
388
+ button:has(svg[data-testid*="delete"]),
389
+ button:has(svg[data-testid*="trash"]) {
390
+ display: none !important;
391
+ }