Commit c564b63 · Fresh deploy: all latest files

Files changed:
- .gitignore +194 -0
- LICENSE +21 -0
- README.md +93 -0
- app/agent.py +27 -0
- app/config.py +40 -0
- app/memory.py +278 -0
- app/tool.py +184 -0
- app/utils.py +72 -0
- main.py +183 -0
- requirements.txt +7 -0
- styles.css +391 -0
.gitignore
ADDED
@@ -0,0 +1,194 @@
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class
archive/
# C extensions
*.so
test*
# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
# For a library or package, you might want to ignore these files since the code is
# intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
# However, in case of collaboration, if having platform-specific dependencies or dependencies
# having no cross-platform support, pipenv may install dependencies that don't work, or not
# install all needed dependencies.
#Pipfile.lock

# UV
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
#uv.lock

# poetry
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
# This is especially recommended for binary packages to ensure reproducibility, and is more
# commonly ignored for libraries.
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
# in version control.
# https://pdm.fming.dev/latest/usage/project/#working-with-version-control
.pdm.toml
.pdm-python
.pdm-build/

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
# and can be added to the global gitignore or merged into this file. For a more nuclear
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/

# Abstra
# Abstra is an AI-powered process automation framework.
# Ignore directories containing user credentials, local state, and settings.
# Learn more at https://abstra.io/docs
.abstra/

# Visual Studio Code
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
# and can be added to the global gitignore or merged into this file. However, if you prefer,
# you could uncomment the following to ignore the entire vscode folder
# .vscode/

# Ruff stuff:
.ruff_cache/

# PyPI configuration file
.pypirc

# Cursor
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
# refer to https://docs.cursor.com/context/ignore-files
.cursorignore
.cursorindexingignore

LICENSE
ADDED
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2025 Jing Bi

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

README.md
ADDED
@@ -0,0 +1,93 @@
---
title: Perceptual Copilot
emoji: 👁️
colorFrom: yellow
colorTo: purple
sdk: gradio
sdk_version: 5.33.1
app_file: main.py
pinned: false
license: mit
---

## ✨ What is Perceptual Copilot?

Perceptual Copilot is a prototype that demonstrates the integration of OpenAI agents with visual tools to process real-time video streams. This experimental platform showcases both the promising potential and the current limitations of equipping agents with vision capabilities to understand and interact with live visual data.

### Architecture Overview

```
┌─────────────────┐    ┌─────────────────┐    ┌─────────────────┐
│     Webcam      │───▶│     Memory      │◀──▶│     Gradio      │
└─────────────────┘    └─────────────────┘    └─────────────────┘
                               │
                               ▼
                       ┌─────────────────┐    ┌─────────────────┐
                       │      Agent      │◀──▶│      Tools      │
                       └─────────────────┘    └─────────────────┘
```

### Available Tools

| Tool | Description | Output |
|------|-------------|--------|
| `caption` | Generate detailed image descriptions | Rich visual descriptions |
| `ocr` | Extract text from images | Extracted text content |
| `localize` | Detect and locate objects | Bounding boxes with labels |
| `qa` | Answer questions about images | Contextual answers |
| `time` | Get current timestamp | Current date and time |
| _More tools coming soon..._ | Additional capabilities in development | Various outputs |

## 🚀 Quick Start

### Prerequisites

- Webcam access

### Installation

1. **Install dependencies**
   ```bash
   pip install -r requirements.txt
   ```

2. **Set up environment variables**
   ```bash
   export HF_TOKEN="your_huggingface_token"
   export API_KEY="your_openai_api_key"
   export END_LANG="your_llm_endpoint"
   export END_TASK="your_task_endpoint"
   export MODEL_AGENT="your_agent_model"
   export MODEL_MLLM="your_multimodal_model"
   export MODEL_LOC="your_localization_model"
   ```

3. **Launch the application**
   ```bash
   python main.py
   ```

## 💡 Usage Examples

### Basic Interaction
- **User**: "What do you see?"
- **Assistant**: *Generates a detailed caption of the current view*

### OCR Functionality
- **User**: "Read the text in this document"
- **Assistant**: *Extracts and returns all visible text*

### Object Detection
- **User**: "What objects are in front of me?"
- **Assistant**: *Identifies and localizes objects with bounding boxes*

## Acknowledgments

- Built with [Gradio](https://gradio.app/) for the interactive web interface
- Uses [Supervision](https://supervision.roboflow.com/) for frame annotation
- WebRTC integration via [FastRTC](https://github.com/gradio-app/gradio)

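For local runs, the same variables can also live in a `.env` file, since `app/config.py` loads `python-dotenv` when it is installed. A minimal sketch follows; every value is a placeholder, and note that `app/config.py` additionally reads `FPS` and `DEBUG`, which are not listed in the export block above (the `FPS=2` value here is only an illustrative choice, not a shipped default):

```bash
# .env — placeholder values only; point these at your own endpoints and models
HF_TOKEN=your_huggingface_token
API_KEY=your_openai_api_key
END_LANG=your_llm_endpoint
END_TASK=your_task_endpoint
MODEL_AGENT=your_agent_model
MODEL_MLLM=your_multimodal_model
MODEL_LOC=your_localization_model
FPS=2          # frames per second kept in Memory (read by app/config.py)
DEBUG=1        # "1"/"true"/"yes" enables debug logging
```
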
app/agent.py
ADDED
@@ -0,0 +1,27 @@
from agents import Agent
from app.memory import Memory
from openai import AsyncOpenAI
from app.config import env
from agents import set_default_openai_client, set_default_openai_api, set_tracing_disabled
from app.tool import caption, ocr, localize, qa, time, video_caption, video_qa


def build_agent():
    client = AsyncOpenAI(base_url=env.end_lang, api_key=env.api_key)
    set_default_openai_client(client=client, use_for_tracing=False)
    set_default_openai_api("chat_completions")
    set_tracing_disabled(disabled=True)
    chat_agent = Agent[Memory](
        name="Assistant",
        tools=[caption, ocr, qa, time, localize, video_caption, video_qa],
        model=env.model_agent,
        instructions=(
            "As a helpful assistant, your functions include answering questions about images, "
            "Optical Character Recognition (OCR), image caption generation, object localization "
            "within images, and video caption generation and Q&A. For video-related tools, you "
            "will need to determine the appropriate time window to analyze from the past."
        ),
    )

    return chat_agent

app/config.py
ADDED
@@ -0,0 +1,40 @@
import os
import logging
from openai import OpenAI


try:
    from dotenv import load_dotenv
    load_dotenv()
except ImportError:
    pass


class Envs:
    def __init__(self):
        self.hf_token = os.getenv("HF_TOKEN")
        self.api_key = os.getenv("API_KEY")
        self.end_task = os.getenv("END_TASK")
        self.end_lang = os.getenv("END_LANG")
        self.model_agent = os.getenv("MODEL_AGENT")
        self.model_mllm = os.getenv("MODEL_MLLM")
        self.model_loc = os.getenv("MODEL_LOC")

        # Only initialize the OpenAI client if we have the required env vars
        if self.end_lang and self.api_key:
            self.client = OpenAI(base_url=self.end_lang, api_key=self.api_key)
        else:
            self.client = None
            print("WARNING: OpenAI client not initialized due to missing environment variables")

        self.debug = os.getenv("DEBUG", "1").lower() in ("true", "1", "yes")
        # NOTE: FPS must be set in the environment; int(None) raises if it is missing
        self.fps = int(os.getenv("FPS"))


env = Envs()

logger = logging.getLogger('copilot')
logger.setLevel(logging.DEBUG if env.debug else logging.INFO)
logger.addHandler(logging.StreamHandler())

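Because `Envs.__init__` runs at import time, `from app.config import env` fails unless `FPS` is present. A minimal sketch of what the environment needs just to import the module; the values are illustrative assumptions, not defaults shipped with the app:

```python
# Illustrative only: set the variables app/config.py requires before importing it.
import os

os.environ.setdefault("FPS", "2")     # required; int() of a missing value would raise
os.environ.setdefault("DEBUG", "0")   # optional; "1"/"true"/"yes" turns on debug logging
# API_KEY / END_LANG are optional at import time; without them env.client stays None.

from app.config import env, logger

logger.info("fps=%s debug=%s client=%s", env.fps, env.debug, bool(env.client))
```
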
app/memory.py
ADDED
@@ -0,0 +1,278 @@
import asyncio
from dataclasses import dataclass, field
from agents import Runner, RunHooks
import threading
from typing import Any, Dict, Optional, List
import traceback
import time
from datetime import datetime
import numpy as np
import gradio as gr

from .config import logger, env


@dataclass
class RunnerStep:
    """Log entry for a single Runner step"""
    timestamp: str
    step_type: str
    agent_name: str
    turn_number: int
    details: Dict[str, Any] = field(default_factory=dict)
    duration_ms: Optional[float] = None

    def __str__(self) -> str:
        return f"[{self.timestamp}][T{self.turn_number}][{self.step_type}]: {self.details}"


@dataclass
class Message:
    role: str
    content: str
    mode: str
    metadata: Dict[str, Any] = field(default_factory=dict)

    @classmethod
    def user(cls, content: str) -> "Message":
        return cls("user", content, '')

    @classmethod
    def system(cls, content: str) -> "Message":
        return cls("system", content, '')

    @classmethod
    def tool(cls, content: str, **kwargs) -> "Message":
        return cls("assistant", content, 'tool', kwargs)

    @classmethod
    def assistant(cls, content: str, mode='') -> "Message":
        return cls("assistant", content, mode)

    @classmethod
    def tts(cls, content: str) -> "Message":
        return cls("assistant", content, 'tts')

    def to_dict(self) -> Dict[str, Any]:
        result = {"role": self.role, "content": self.content}
        if self.mode == "tool":
            metadata = self.metadata.copy()
            if title := metadata.get("title"):
                metadata["title"] = title.title()
            result["metadata"] = metadata
        return result


@dataclass
class Snapshot:
    sender: str
    data: Any

    @property
    def gr(self):
        if isinstance(self.data, np.ndarray):
            return gr.Image(self.data)
        return self.data


class RunnerLoggerHooks(RunHooks):
    """Custom hooks to log every step of the Runner"""

    def __init__(self, memory_instance):
        super().__init__()
        self.memory = memory_instance
        self.current_turn = 0
        self.turn_start_time = None

    async def on_agent_start(self, context, agent):
        self.current_turn += 1
        self.turn_start_time = time.time()

        step = RunnerStep(
            timestamp=datetime.now().isoformat(),
            step_type="turn_start",
            agent_name=agent.name,
            turn_number=self.current_turn,
            details={"message": f"Starting turn {self.current_turn} with agent {agent.name}"}
        )
        self.memory.log_runner_step(step)

    async def on_agent_end(self, context, agent, result):
        if self.turn_start_time:
            duration = (time.time() - self.turn_start_time) * 1000
        else:
            duration = None

        step = RunnerStep(
            timestamp=datetime.now().isoformat(),
            step_type="agent_call",
            agent_name=agent.name,
            turn_number=self.current_turn,
            details={"message": f"Agent {agent.name} completed", "result_type": type(result).__name__},
            duration_ms=duration
        )
        self.memory.log_runner_step(step)

    async def on_tool_start(self, context, agent, tool_call):
        tool_name = getattr(tool_call, 'name', 'unknown')
        tool_args = None
        for attr in ['arguments', 'args', 'function', 'parameters']:
            if hasattr(tool_call, attr):
                tool_args = getattr(tool_call, attr)
                break
        step = RunnerStep(
            timestamp=datetime.now().isoformat(),
            step_type="tool_call",
            agent_name=agent.name,
            turn_number=self.current_turn,
            details={
                "tool_name": tool_name,
                "tool_args": tool_args,
                "message": f"Calling tool {tool_name}"
            }
        )
        self.memory.log_runner_step(step)

    async def on_tool_end(self, context, agent, tool_call, result):
        # Handle different tool_call object attributes safely
        tool_name = getattr(tool_call, 'name', 'unknown')

        step = RunnerStep(
            timestamp=datetime.now().isoformat(),
            step_type="tool_result",
            agent_name=agent.name,
            turn_number=self.current_turn,
            details={
                "tool_name": tool_name,
                "result_length": len(str(result)) if result else 0,
                "message": f"Tool {tool_name} completed"
            }
        )
        self.memory.log_runner_step(step)


class Chat:
    def __init__(self):
        self.history = []

    def append(self, message: Message):
        self.history.append(message)

    @property
    def messages(self):
        return [i.to_dict() for i in self.history]


class Memory:
    def __init__(self, agent, limit: int = 200) -> None:
        self.limit: int = limit
        self.frames: list[Any] = []
        self.snapshots: list[Any] = []
        self.inputs: list[Any] = []
        self.chat = Chat()

        self.runner_steps: List[RunnerStep] = []
        self.step_limit: int = 1000  # Keep last 1000 steps
        self.logger_hooks: Optional[RunnerLoggerHooks] = None

        self._chat_q: asyncio.Queue[Any] = asyncio.Queue()
        self._input_q: asyncio.Queue[Any] = asyncio.Queue()
        self._loop: Optional[asyncio.AbstractEventLoop] = None
        self.is_waiting: bool = False
        self.is_running: bool = False
        self._last_frame_time: float = 0
        self.setup(agent)

    def log_runner_step(self, step: RunnerStep) -> None:
        """Log a runner step and maintain the step history limit"""
        self.runner_steps.append(step)
        logger.debug(f"[ 🛠️ ]{step}")
        while len(self.runner_steps) > self.step_limit:
            self.runner_steps.pop(0)

    def enqueue(self, data: Any) -> Optional[Snapshot]:
        current_time = time.time()
        if current_time - self._last_frame_time > 1.0 / env.fps:
            self._last_frame_time = current_time
            self.frames.append(data)
            while len(self.frames) > self.limit:
                self.frames.pop(0)
        return self.snapshots.pop(0) if self.snapshots else None

    def receive(self, text: str) -> None:
        self.chat.append(Message.user(text))
        self._loop.call_soon_threadsafe(self._chat_q.put_nowait, text)

    def setup(self, agent) -> None:
        """Bind *agent* and spawn the background monitor threads."""
        self.v_agent = agent
        self.logger_hooks = RunnerLoggerHooks(self)

        def _runner() -> None:
            self._loop = asyncio.new_event_loop()
            asyncio.set_event_loop(self._loop)
            try:
                self._loop.create_task(self._monitor_chat())
                self._loop.run_forever()
            finally:
                self._loop.close()

        threading.Thread(target=_runner, daemon=True).start()

    async def _monitor_chat(self) -> None:
        """Process incoming chat messages, respecting the waiting gate."""
        while True:
            text = await self._chat_q.get()
            logger.debug(f"Processing: {text}")
            start_step = RunnerStep(
                timestamp=datetime.now().isoformat(),
                step_type="processing_start",
                agent_name=getattr(self.v_agent, 'name', 'unknown'),
                turn_number=0,
                details={"user_input": text}
            )
            self.log_runner_step(start_step)

            try:
                self.is_running = True
                result = await Runner.run(
                    starting_agent=self.v_agent,
                    input=text,
                    context=self,
                    hooks=self.logger_hooks  # Add our custom hooks here
                )

                self.is_running = False

                # Log successful completion
                success_step = RunnerStep(
                    timestamp=datetime.now().isoformat(),
                    step_type="final_output",
                    agent_name=getattr(self.v_agent, 'name', 'unknown'),
                    turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
                    details={
                        "output_type": type(result.final_output).__name__,
                        "output_preview": str(result.final_output)[:100] + "..." if len(str(result.final_output)) > 100 else str(result.final_output)
                    }
                )
                self.log_runner_step(success_step)

            except Exception as exc:  # noqa: BLE001
                self.is_running = False
                full_traceback = traceback.format_exc()
                logger.debug(f"Error in _monitor_chat: {exc}\n{full_traceback}")

                # Log the error
                error_step = RunnerStep(
                    timestamp=datetime.now().isoformat(),
                    step_type="error",
                    agent_name=getattr(self.v_agent, 'name', 'unknown'),
                    turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
                    details={
                        "error_type": type(exc).__name__,
                        "error_message": str(exc),
                        "traceback": full_traceback
                    }
                )
                self.log_runner_step(error_step)
                continue

            final = result.final_output.split('</think>', 1)[-1]
            self.chat.append(Message.assistant(final))
            await asyncio.sleep(0)

app/tool.py
ADDED
@@ -0,0 +1,184 @@
import datetime
import json
import cv2
import httpx
from app.config import env
from app.utils import image_w_box, encode_image
from agents import RunContextWrapper, function_tool
from app.memory import Memory, Snapshot


def task(name, image):
    resp = httpx.post(
        f"{env.end_task}",
        data={"name": name},
        files={"file": ("frame.jpg", image.tobytes(), "image/jpeg")},
        timeout=10,
        headers={"Authorization": env.api_key},
    )
    resp.raise_for_status()
    return resp.json()['result']


def completion(messages, model):
    response = env.client.chat.completions.create(
        model=model,
        messages=messages
    )
    return response.choices[0].message.content


def completion_image(images, prompt, model):
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": prompt},
                {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
            ],
        }
        for b64, mime in map(encode_image, images)
    ]
    return completion(messages, model=model)


# ------------------------ Function Tools ------------------------
@function_tool
def caption(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Generate a descriptive caption for the most recent frame, record it as a snapshot, and return it.

    Returns:
        str:
            The generated caption for the current view (i.e., the latest frame).
    """
    mem = wrapper.context
    prompt = "Describe the image with rich details but in a concise manner."
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='caption', data=result))
    return result


@function_tool
def ocr(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Perform OCR on the most recent frame, record it as a snapshot, and return the extracted text.

    Returns:
        str:
            The extracted text from the current view (i.e., the latest frame).
    """
    mem = wrapper.context
    prompt = "Extract all text from the image/payslip without missing anything."
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='ocr', data=result))
    return result


@function_tool
def qa(wrapper: RunContextWrapper[Memory], question: str) -> str:
    """
    Answer a question based on the most recent frame, record it as a snapshot, and return the answer.

    Args:
        question (str): The question to be answered.
    Returns:
        str:
            The answer to the question based on the current view (i.e., the latest frame).
    """
    mem = wrapper.context
    prompt = f"Answer the question based on the image. Question: {question}"
    result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='qa', data=result))
    return result


@function_tool
def localize(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Localize all objects in the most recent frame.

    Returns:
        str:
            The localization result for the current view (i.e., the latest frame).
            The format is {name: list of bboxes}.
    """
    mem = wrapper.context
    frame = mem.frames[-1]
    _, img = cv2.imencode('.jpg', frame)
    objxbox = task(env.model_loc, img)
    mem.snapshots.append(Snapshot(sender='localize', data=image_w_box(frame, objxbox)))
    return json.dumps(objxbox, indent=2)


@function_tool
def time(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Get the current time, record it as a snapshot, and return the time.

    Returns:
        str:
            The current time.
    """
    mem = wrapper.context
    result = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    mem.snapshots.append(Snapshot(sender='time', data=result))
    return result


def sample_frames(mem: Memory, n: int) -> list:
    """
    Sample frames from the past n seconds of video.

    Args:
        mem (Memory): The memory context containing frames.
        n (int): Number of seconds to look back for video frames.
    Returns:
        list: Sampled frames from the video sequence.
    """
    if len(mem.frames) == 0:
        return []

    available_frames = min(n * env.fps, len(mem.frames))
    recent_frames = mem.frames[-available_frames:]
    sampled_frames = recent_frames[::env.fps // 2]

    return sampled_frames


@function_tool
def video_caption(wrapper: RunContextWrapper[Memory], n=2) -> str:
    """
    Generate a descriptive caption for a video sequence from the past n seconds of frames.
    The n is a required parameter that specifies how many seconds of video frames to consider.

    Args:
        n (int): Number of seconds to look back for video frames.
    Returns:
        str:
            The generated caption for the video sequence from the past n seconds.
    """
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)

    if len(sampled_frames) == 0:
        return "No frames available for video caption."

    prompt = "Describe this video sequence focusing on any changes or actions that occur over time."
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video caption', data=result))
    return result


@function_tool
def video_qa(wrapper: RunContextWrapper[Memory], question: str, n=2) -> str:
    """
    Answer a question based on a video sequence from the past n seconds of frames.

    Args:
        question (str): The question to be answered.
        n (int): Number of seconds to look back for video frames.
    Returns:
        str:
            The answer to the question based on the video sequence from the past n seconds.
    """
    mem = wrapper.context
    sampled_frames = sample_frames(mem, n)

    if len(sampled_frames) == 0:
        return "No frames available for video Q&A."

    prompt = f"Answer the question based on this video sequence. Question: {question}"
    result = completion_image(sampled_frames, prompt, env.model_mllm)
    mem.snapshots.append(Snapshot(sender='video qa', data=result))
    return result

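To make the frame-sampling arithmetic in `sample_frames` concrete, here is a small self-contained sketch of the same slicing logic, assuming for illustration an `env.fps` of 4 and a 200-frame buffer (this is not part of the app; it only mirrors the slice expressions above):

```python
# Stand-alone illustration of the slicing used by sample_frames (assumed fps=4, n=2).
fps = 4                      # stand-in for env.fps
frames = list(range(200))    # stand-in for Memory.frames, oldest -> newest
n = 2                        # look back 2 seconds

available = min(n * fps, len(frames))   # 8 frames cover the last 2 seconds
recent = frames[-available:]            # the 8 newest frames
sampled = recent[::fps // 2]            # keep every 2nd frame -> 4 frames go to the MLLM
print(sampled)                          # [192, 194, 196, 198]
```
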
app/utils.py
ADDED
@@ -0,0 +1,72 @@
from PIL import Image
import base64
import supervision as sv
import numpy as np
import cv2

colors = sv.ColorPalette.from_hex(
    [
        "#a1c9f4",
        "#ffb482",
        "#8de5a1",
        "#ff9f9b",
        "#d0bbff",
        "#debb9b",
        "#fab0e4",
        "#cfcfcf",
        "#fffea3",
        "#b9f2f0",
        "#a1c9f4",
        "#ffb482",
        "#8de5a1",
        "#ff9f9b",
        "#d0bbff",
        "#debb9b",
        "#fab0e4",
        "#cfcfcf",
        "#fffea3",
        "#b9f2f0",
    ]
)


def image_w_box(image, objxbox):

    box_annotator = sv.BoxCornerAnnotator(thickness=10, corner_length=30, color=colors)
    label_annotator = sv.LabelAnnotator(color=colors)
    mask_annotator = sv.MaskAnnotator(opacity=0.2, color=colors)

    xyxys = np.array([v for boxes in objxbox.values() for v in boxes])
    unique_labels = sorted(objxbox.keys())
    class_id_map = dict(enumerate(unique_labels))
    labels = [l for l, boxes in objxbox.items() for _ in boxes]
    class_id = [list(class_id_map.values()).index(label) for label in labels]

    masks = np.zeros((len(xyxys), image.shape[0], image.shape[1]), dtype=bool)
    for i, (x1, y1, x2, y2) in enumerate(xyxys):
        masks[i, int(y1):int(y2), int(x1):int(x2)] = True  # mark the box region as foreground

    if len(xyxys) == 0:
        return image
    detections = sv.Detections(
        xyxy=xyxys,
        mask=masks,
        class_id=np.array(class_id),
    )
    # Convert RGB to BGR for annotation
    image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    # After annotation, convert back to RGB
    annotated_image = box_annotator.annotate(scene=image_bgr.copy(), detections=detections)
    annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
    annotated_image = mask_annotator.annotate(scene=annotated_image, detections=detections)

    return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)


def encode_image(img) -> tuple[str, str]:
    arr = np.array(img.convert("RGB")) if isinstance(img, Image.Image) else img
    if not isinstance(arr, np.ndarray):
        raise ValueError("Unsupported image type")
    ok, buf = cv2.imencode('.jpg', cv2.cvtColor(arr, cv2.COLOR_RGB2BGR))
    if not ok:
        raise ValueError("Encoding failed")
    b64 = base64.b64encode(buf).decode('utf-8')
    return b64, "image/jpeg"

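A quick way to exercise these helpers outside the app is to feed `image_w_box` a synthetic frame together with a hand-written detection dict in the `{label: [[x1, y1, x2, y2], ...]}` shape that `localize` passes in. A minimal sketch follows; the labels and box coordinates are made up for illustration:

```python
# Minimal smoke test for the annotation helpers (synthetic frame, hypothetical labels).
import numpy as np
from app.utils import image_w_box, encode_image

frame = np.zeros((480, 640, 3), dtype=np.uint8)           # blank RGB frame
detections = {
    "cup": [[100, 120, 220, 260]],                         # {label: list of xyxy boxes}
    "book": [[300, 200, 500, 400]],
}

annotated = image_w_box(frame, detections)                 # RGB frame with boxes, labels, masks drawn
b64, mime = encode_image(annotated)                        # base64 JPEG plus MIME type for the MLLM
print(annotated.shape, mime, len(b64))
```
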
main.py
ADDED
@@ -0,0 +1,183 @@
from pathlib import Path
import os
import cv2
import gradio as gr
from fastrtc import Stream, WebRTC
from app.config import env
from fastrtc import AdditionalOutputs
from app.memory import Memory, Message
from fastrtc import get_cloudflare_turn_credentials
from app.agent import build_agent
from fastrtc import get_current_context

session_memories = {}


def get_session_memory(session_id: str = None) -> Memory:
    if session_id not in session_memories:
        session_memories[session_id] = Memory(build_agent())
        welcome_message = "👋 Now I can see. Feel free to ask me about anything!"
        session_memories[session_id].chat.append(Message.assistant(welcome_message))
    return session_memories[session_id]


def video_handler(frame):
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    rtcid = get_current_context().webrtc_id
    mem = get_session_memory(rtcid)
    if (snapshot := mem.enqueue(frame)):
        mem.chat.append(Message.tool(snapshot.gr, title=snapshot.sender, status='done'))
    return frame, AdditionalOutputs(mem.chat.messages, rtcid)


def chat_handler(text, webrtc_state):
    if webrtc_state is None:
        return "", [{"role": "assistant", "content": "Please start your camera first to begin the conversation."}], webrtc_state

    mem = get_session_memory(webrtc_state)
    if not mem.is_running:
        mem.receive(text.strip())
    return "", mem.chat.messages, webrtc_state


if __name__ == "__main__":
    print("🚀 Starting Perceptual Copilot...")
    print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
    print(f"Environment check - API_KEY: {'✓' if env.api_key else '✗'}")
    print(f"Environment check - END_LANG: {'✓' if env.end_lang else '✗'}")
    print(f"Environment check - OpenAI Client: {'✓' if env.client else '✗'}")

    with gr.Blocks(
        title="🤖 Perceptual Copilot - AI Vision Assistant",
        theme=gr.themes.Soft(
            primary_hue="blue",
            secondary_hue="orange",
            neutral_hue="slate",
            font=("system-ui", "sans-serif")
        ),
        css=Path("styles.css").read_text(),
    ) as demo:

        # Header section with sleek styling
        gr.Markdown("""
        <div class="ultra-sleek-header">
            <h1 class="hero-title">
                <span class="title-primary">Perceptual</span>
                <span class="title-accent">Copilot</span>
            </h1>
            <p class="hero-subtitle">
                <span class="status-dot"></span>
                An experimental prototype that integrates OpenAI agents with visual tools to process real-time video streams.
            </p>
            <div class="feature-pills">
                <span class="pill">Real-time streaming</span>
                <span class="pill">Visual Agent</span>
                <span class="pill">Large vision language model</span>
                <span class="pill">Reasoning</span>
            </div>
        </div>
        """, elem_classes="ultra-sleek-header")

        state = gr.State(value=None)

        # Main interface with improved layout
        with gr.Row(equal_height=True):
            with gr.Column(scale=1, elem_classes="video-container"):
                video = WebRTC(
                    label="🎥 Camera Stream",
                    rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
                    track_constraints={
                        "width": {"exact": 600},
                        "height": {"exact": 600},
                        "aspectRatio": {"exact": 1}},
                    mode="send",
                    modality="video",
                    mirror_webcam=True,
                    width=600,
                    height=600,
                )

            with gr.Column(scale=1, elem_classes="chat-container"):
                gr.Markdown("### 💬 Chat")
                chatbot = gr.Chatbot(
                    type="messages",
                    height=450,
                    label="🤖 AI Assistant",
                    placeholder="Chat history will appear here...",
                    show_label=False,
                )

                with gr.Row(elem_classes="items-center"):
                    textbox = gr.Textbox(
                        placeholder="💭 Question goes here, press ENTER to send",
                        lines=1,
                        show_label=False,
                    )

        # Event handlers
        video.stream(
            fn=video_handler,
            inputs=[video],
            outputs=[video],
            concurrency_limit=10,
        )
        video.on_additional_outputs(
            fn=lambda messages, webrtc_id: (messages, webrtc_id),
            outputs=[chatbot, state]
        )

        # Chat handler for textbox
        textbox.submit(
            chat_handler,
            inputs=[textbox, state],
            outputs=[textbox, chatbot, state]
        )

        # Enhanced instructions section
        with gr.Column(elem_classes="instructions-container"):
            gr.Markdown("""
            ## 🚀 Get Started

            **📌 Quick Reminder:**
            1. Allow camera access when prompted
            2. Wait for the camera to initialize and the first message to appear
            3. 💡 **Tip:** If you find it hard to see the interface, please turn off night mode for better visibility
            """)

            with gr.Row():
                with gr.Column():
                    gr.Markdown("""
                    ### 💡 Example Prompts

                    **🌍 General Vision:**
                    - *"What do you see in front of me?"*
                    - *"What's the overall environment like?"*

                    **📄 Text & Documents:**
                    - *"Read the text in this document"*
                    - *"Extract the code snippet from this image"*

                    **🔍 Object Recognition:**
                    - *"What objects are visible?"*
                    - *"Help me identify this item"*
                    """)

                with gr.Column():
                    gr.Markdown("""
                    ### 🔧 Current Capabilities

                    **🚀 Available Features:**
                    - **OCR** - Text extraction and reading
                    - **Q&A** - Visual question answering
                    - **Caption** - Scene description and analysis
                    - **Localization** - Object detection and positioning
                    - **Time** - Current time and temporal context

                    **📈 More Coming Soon:**
                    We're continuously adding new capabilities to enhance your visual AI experience.

                    **⚠️ Important Note:**
                    All models are self-hosted. Please avoid abuse of the system.
                    """)
    demo.queue(default_concurrency_limit=None)
    demo.launch()

requirements.txt
ADDED
@@ -0,0 +1,7 @@
huggingface_hub
python-dotenv
supervision
openai-agents
fastrtc==0.0.21
gradio
pydantic==2.10.6

styles.css
ADDED
@@ -0,0 +1,391 @@
.gradio-container {
    background: linear-gradient(135deg, #fefefe 0%, #f8f6f0 100%) !important;
    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
}

.main-header {
    background: rgba(255, 255, 255, 0.95);
    border-radius: 15px;
    padding: 20px;
    margin: 10px 0;
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
    backdrop-filter: blur(10px);
    border: 1px solid rgba(255, 255, 255, 0.2);
}

.video-container {
    background: rgba(255, 255, 255, 0.9);
    border-radius: 12px;
    padding: 20px;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
    backdrop-filter: blur(8px);
    border: 1px solid rgba(255, 255, 255, 0.3);
}

.chat-container {
    background: rgba(255, 255, 255, 0.9);
    border-radius: 12px;
    padding: 20px;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
    backdrop-filter: blur(8px);
    border: 1px solid rgba(255, 255, 255, 0.3);
}

.chat-container textarea {
    border: 3px solid #4285f4 !important;
    border-radius: 12px !important;
    padding: 15px !important;
    font-size: 16px !important;
    font-weight: 500 !important;
    background: #ffffff !important;
    box-shadow: 0 4px 20px rgba(66, 133, 244, 0.15) !important;
    transition: all 0.3s ease !important;
    min-height: 50px !important;
    display: flex !important;
    align-items: center !important;
    justify-content: center !important;
    resize: none !important;
}

.chat-container textarea:focus {
    border-color: #1a73e8 !important;
    box-shadow: 0 6px 25px rgba(66, 133, 244, 0.3), 0 0 0 3px rgba(66, 133, 244, 0.1) !important;
    outline: none !important;
    transform: translateY(-2px) !important;
}

.chat-container textarea::placeholder {
    color: #5f6368 !important;
    font-weight: 400 !important;
    font-size: 15px !important;
}

.chat-container .gr-text-input {
    border: none !important;
    background: transparent !important;
}

.instructions-container {
    background: rgba(255, 255, 255, 0.9);
    border-radius: 12px;
    padding: 25px;
    margin: 20px 0;
    box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
    backdrop-filter: blur(8px);
    border: 1px solid rgba(255, 255, 255, 0.3);
}

.feature-card {
    background: linear-gradient(135deg, #ff6b6b, #feca57);
    border-radius: 10px;
    padding: 15px;
    margin: 10px 0;
    color: white;
    box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
}

.status-indicator {
    display: inline-block;
    width: 10px;
    height: 10px;
    background: #4CAF50;
    border-radius: 50%;
    margin-right: 8px;
    animation: pulse 2s infinite;
}

@keyframes pulse {
    0% { opacity: 1; }
    50% { opacity: 0.5; }
    100% { opacity: 1; }
}

.title-emoji {
    font-size: 2em;
    margin-right: 10px;
    vertical-align: middle;
}

/* Hide Gradio branding and footer elements */
.gradio-container footer {
    display: none !important;
}

/* Hide "Use via API" button */
.gradio-container .api-docs {
    display: none !important;
}

/* Hide "Built with Gradio" logo */
.gradio-container .built-with {
    display: none !important;
}

/* Hide Settings button */
.gradio-container .settings {
    display: none !important;
}

/* Hide the entire footer area */
.gradio-container .footer {
    display: none !important;
}

/* Alternative selectors for different Gradio versions */
.gradio-container div[data-testid="footer"] {
    display: none !important;
}

.gradio-container .gradio-footer {
    display: none !important;
}

/* Hide any elements containing gradio branding text */
.gradio-container a[href*="gradio"] {
    display: none !important;
}

.gradio-container div:has-text("Built with") {
    display: none !important;
}

.gradio-container div:has-text("Use via API") {
    display: none !important;
}

/* Ultra-sleek header styling with modern design */
.ultra-sleek-header {
    text-align: center;
    padding: 20px 20px 5px 20px;
    margin: 0;
    background: transparent;
    border: none;
    position: relative;
    overflow: hidden;
}

.ultra-sleek-header::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    right: 0;
    bottom: 0;
    background: radial-gradient(ellipse at center top, rgba(99, 102, 241, 0.05) 0%, transparent 70%);
    pointer-events: none;
}

.badge-icon {
    font-size: 1rem;
    animation: float 3s ease-in-out infinite;
}

@keyframes float {
    0%,
    100% {
        transform: translateY(0px);
    }

    50% {
        transform: translateY(-4px);
    }
}

.hero-title {
    font-size: 4rem;
    font-weight: 800;
    margin: 10px 0 5px 0;
    line-height: 1.1;
    letter-spacing: -0.03em;
    position: relative;
    z-index: 1;
}

.title-primary {
    background: linear-gradient(45deg,
        #4169E1 0%,
        #8A2BE2 50%,
        #E91E63 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    position: relative;
}

.title-primary::after {
    content: '';
    position: absolute;
    bottom: -8px;
    left: 0;
    right: 0;
    height: 4px;
    background: linear-gradient(90deg, #6366f1, #8b5cf6, #d946ef);
    border-radius: 2px;
    opacity: 0.6;
}

.title-accent {
    background: linear-gradient(135deg, #1e293b 0%, #0d1e35 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    position: relative;
    margin-left: 12px;
}

.hero-subtitle {
    font-size: 1.25rem;
    color: #64748b;
    font-weight: 400;
    margin: 15px auto 20px auto;
    line-height: 1.6;
    max-width: 580px;
    display: flex;
    align-items: center;
    justify-content: center;
    gap: 12px;
}

.status-dot {
    display: inline-block;
    width: 8px;
    height: 8px;
    background: #10b981;
    border-radius: 50%;
    position: relative;
    flex-shrink: 0;
}

.status-dot::before {
    content: '';
    position: absolute;
    top: 0;
    left: 0;
    width: 8px;
    height: 8px;
    background: #10b981;
    border-radius: 50%;
    animation: ping 2s cubic-bezier(0, 0, 0.2, 1) infinite;
    transform-origin: center;
}

@keyframes ping {
    75%,
    100% {
        transform: scale(2);
        opacity: 0;
    }
}

.feature-pills {
    display: flex;
    justify-content: center;
    gap: 12px;
    margin-top: 20px;
    flex-wrap: wrap;
}

.pill {
    background: linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(248, 250, 252, 0.9));
    border: 1px solid rgba(148, 163, 184, 0.15);
    border-radius: 25px;
    padding: 6px 14px;
    font-size: 0.8rem;
    font-weight: 600;
    color: #475569;
    backdrop-filter: blur(12px);
    transition: all 0.25s cubic-bezier(0.4, 0, 0.2, 1);
    white-space: nowrap;
    box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
    letter-spacing: 0.025em;
}

.pill:hover {
    background: linear-gradient(135deg, rgba(99, 102, 241, 0.1), rgba(139, 92, 246, 0.05));
    border-color: rgba(99, 102, 241, 0.25);
    color: #6366f1;
    transform: translateY(-1px) scale(1.02);
    box-shadow: 0 4px 12px rgba(99, 102, 241, 0.15), 0 2px 4px rgba(0, 0, 0, 0.05);
}

/* Ensure proper vertical alignment for chat input row */
.items-center {
    display: flex !important;
    align-items: center !important;
    gap: 8px !important;
}

/* Responsive design for mobile */
@media (max-width: 768px) {
    .ultra-sleek-header {
        padding: 15px 15px 20px 15px;
    }

    .hero-title {
        font-size: 3rem;
    }

    .hero-subtitle {
        font-size: 1.125rem;
        padding: 0 15px;
        flex-direction: column;
        gap: 8px;
    }

    .feature-pills {
        gap: 8px;
        margin-top: 24px;
    }

    .pill {
        font-size: 0.8rem;
        padding: 6px 12px;
    }
}

@media (max-width: 480px) {
    .hero-title {
        font-size: 2.5rem;
        line-height: 1.2;
    }

    .title-accent {
        margin-left: 8px;
    }
}

.gradio-container .chatbot .message-wrap button[aria-label*="clear" i],
.gradio-container .chatbot .message-wrap button[title*="clear" i] {
    display: none !important;
}

/* Target any button with clear-related text content */
button:contains("Clear"),
button:contains("clear"),
button:contains("CLEAR") {
    display: none !important;
}

/* Target buttons in chat interfaces specifically */
.chat-container button[aria-label*="clear" i],
.chatbot button[aria-label*="clear" i],
.message-wrap button[aria-label*="clear" i] {
    display: none !important;
}

/* Hide buttons with specific SVG icons that might represent clear/delete */
button svg[data-testid*="clear"],
button svg[data-testid*="delete"],
button svg[data-testid*="trash"] {
    display: none !important;
}

/* Hide parent button if it contains clear-related SVG */
button:has(svg[data-testid*="clear"]),
button:has(svg[data-testid*="delete"]),
button:has(svg[data-testid*="trash"]) {
    display: none !important;
}