jing-bi committed on
Commit
c564b63
·
0 Parent(s):

Fresh deploy: all latest files

Files changed (11)
  1. .gitignore +194 -0
  2. LICENSE +21 -0
  3. README.md +93 -0
  4. app/agent.py +27 -0
  5. app/config.py +40 -0
  6. app/memory.py +278 -0
  7. app/tool.py +184 -0
  8. app/utils.py +72 -0
  9. main.py +183 -0
  10. requirements.txt +7 -0
  11. styles.css +391 -0
.gitignore ADDED
@@ -0,0 +1,194 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ archive/
6
+ # C extensions
7
+ *.so
8
+ test*
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+
110
+ # pdm
111
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
112
+ #pdm.lock
113
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
114
+ # in version control.
115
+ # https://pdm.fming.dev/latest/usage/project/#working-with-version-control
116
+ .pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
121
+ __pypackages__/
122
+
123
+ # Celery stuff
124
+ celerybeat-schedule
125
+ celerybeat.pid
126
+
127
+ # SageMath parsed files
128
+ *.sage.py
129
+
130
+ # Environments
131
+ .env
132
+ .venv
133
+ env/
134
+ venv/
135
+ ENV/
136
+ env.bak/
137
+ venv.bak/
138
+
139
+ # Spyder project settings
140
+ .spyderproject
141
+ .spyproject
142
+
143
+ # Rope project settings
144
+ .ropeproject
145
+
146
+ # mkdocs documentation
147
+ /site
148
+
149
+ # mypy
150
+ .mypy_cache/
151
+ .dmypy.json
152
+ dmypy.json
153
+
154
+ # Pyre type checker
155
+ .pyre/
156
+
157
+ # pytype static type analyzer
158
+ .pytype/
159
+
160
+ # Cython debug symbols
161
+ cython_debug/
162
+
163
+ # PyCharm
164
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
165
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
166
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
167
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
168
+ #.idea/
169
+
170
+ # Abstra
171
+ # Abstra is an AI-powered process automation framework.
172
+ # Ignore directories containing user credentials, local state, and settings.
173
+ # Learn more at https://abstra.io/docs
174
+ .abstra/
175
+
176
+ # Visual Studio Code
177
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
178
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
179
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
180
+ # you could uncomment the following to ignore the entire vscode folder
181
+ # .vscode/
182
+
183
+ # Ruff stuff:
184
+ .ruff_cache/
185
+
186
+ # PyPI configuration file
187
+ .pypirc
188
+
189
+ # Cursor
190
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
191
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
192
+ # refer to https://docs.cursor.com/context/ignore-files
193
+ .cursorignore
194
+ .cursorindexingignore
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Jing Bi
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,93 @@
1
+ ---
2
+ title: Perceptual Copilot
3
+ emoji: 👁️
4
+ colorFrom: yellow
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 5.33.1
8
+ app_file: main.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ ## ✨ What is Perceptual Copilot?
14
+
15
+ Perceptual Copilot is a prototype that integrates OpenAI agents with visual tools to process real-time video streams. This experimental platform showcases both the promise and the current limitations of equipping agents with vision capabilities to understand and interact with live visual data.
16
+
17
+
18
+ ### Architecture Overview
19
+
20
+
21
+
22
+ ```
23
+ ┌─────────────────┐ ┌─────────────────┐ ┌─────────────────┐
24
+ │ Webcam │───▶│ Memory │◀──▶│ Gradio │
25
+ └─────────────────┘ └─────────────────┘ └─────────────────┘
26
+
27
+
28
+ ┌─────────────────┐ ┌─────────────────┐
29
+ │ Agent │◀──▶│ Tools │
30
+ └─────────────────┘ └─────────────────┘
31
+ ```
32
+
33
+ ### Available Tools
34
+
35
+ | Tool | Description | Output |
36
+ |------|-------------|---------|
37
+ | `caption` | Generate detailed image descriptions | Rich visual descriptions |
38
+ | `ocr` | Extract text from images | Extracted text content |
39
+ | `localize` | Detect and locate objects | Bounding boxes with labels |
40
+ | `qa` | Answer questions about images | Contextual answers |
41
+ | `time` | Get current timestamp | Current date and time |
42
+ | _More tools coming soon..._ | Additional capabilities in development | Various outputs |
43
+
44
+ ## 🚀 Quick Start
45
+
46
+ ### Prerequisites
47
+
48
+ - Webcam access
49
+
50
+ ### Installation
51
+
52
+ 1. **Install dependencies**
53
+ ```bash
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+ 2. **Set up environment variables**
58
+ ```bash
59
+ export HF_TOKEN="your_huggingface_token"
60
+ export API_KEY="your_openai_api_key"
61
+ export END_LANG="your_llm_endpoint"
62
+ export END_TASK="your_task_endpoint"
63
+ export MODEL_AGENT="your_agent_model"
64
+ export MODEL_MLLM="your_multimodal_model"
65
+ export MODEL_LOC="your_localization_model"
66
+ ```
67
+
68
+ 3. **Launch the application**
69
+ ```bash
70
+ python main.py
71
+ ```
72
+
73
+ ## 💡 Usage Examples
74
+
75
+ ### Basic Interaction
76
+ - **User**: "What do you see?"
77
+ - **Assistant**: *Generates detailed caption of current view*
78
+
79
+ ### OCR Functionality
80
+ - **User**: "Read the text in this document"
81
+ - **Assistant**: *Extracts and returns all visible text*
82
+
83
+ ### Object Detection
84
+ - **User**: "What objects are in front of me?"
85
+ - **Assistant**: *Identifies and localizes objects with bounding boxes*
86
+
87
+
88
+ ## Acknowledgments
89
+
90
+ - Built with [Gradio](https://gradio.app/) for the interactive web interface
91
+ - Uses [Supervision](https://supervision.roboflow.com/) for frame annotation
92
+ - WebRTC integration via [FastRTC](https://github.com/gradio-app/gradio)
93
+
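The architecture diagram in the README maps onto the modules added in this commit roughly as follows. The snippet below is a minimal orientation sketch, not part of the commit: it assumes the environment variables from the Quick Start are set, and it skips the Gradio/WebRTC wiring that `main.py` provides.

```python
import time
import numpy as np
from app.agent import build_agent   # builds the tool-equipped OpenAI Agent
from app.memory import Memory       # per-session frame buffer + background agent loop

mem = Memory(build_agent())          # spawns the background asyncio loop in a daemon thread
time.sleep(0.5)                      # give the loop a moment to start before sending messages

frame = np.zeros((480, 640, 3), dtype=np.uint8)   # stand-in for a webcam frame (RGB)
mem.enqueue(frame)                   # main.py calls this for every WebRTC frame (rate-limited by FPS)

mem.receive("What do you see?")      # queues the question; Runner.run() executes the agent + tools
time.sleep(5)                        # the reply is appended asynchronously once the run finishes
print(mem.chat.messages)             # Gradio-ready chat history, including tool snapshots
```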
app/agent.py ADDED
@@ -0,0 +1,27 @@
1
+
2
+ from agents import Agent
3
+ from app.memory import Memory
4
+ from openai import AsyncOpenAI
5
+ from app.config import env
6
+ from agents import set_default_openai_client, set_default_openai_api, set_tracing_disabled
7
+ from app.tool import caption, ocr, localize, qa, time, video_caption, video_qa
8
+
9
+ def build_agent():
10
+ client = AsyncOpenAI(base_url=env.end_lang,api_key=env.api_key)
11
+ set_default_openai_client(client=client, use_for_tracing=False)
12
+ set_default_openai_api("chat_completions")
13
+ set_tracing_disabled(disabled=True)
14
+ chat_agent = Agent[Memory](
15
+ name="Assistant",
16
+ tools=[caption, ocr, qa, time, localize, video_caption, video_qa],
17
+ model=env.model_agent,
18
+ instructions=(
19
+ "As a helpful assistant, your functions include answering questions about images, "
20
+ "Optical Character Recognition (OCR), image caption generation, object localization "
21
+ "within images, and video caption generation and Q&A. For video-related tools, you "
22
+ "will need to determine the appropriate time window to analyze from the past."
23
+ ),
24
+ )
25
+
26
+ return chat_agent
27
+
app/config.py ADDED
@@ -0,0 +1,40 @@
1
+ import os
2
+ import logging
3
+ from openai import OpenAI
4
+
5
+
6
+
7
+
8
+ try:
9
+ from dotenv import load_dotenv
10
+ load_dotenv()
11
+ except ImportError:
12
+ pass
13
+
14
+
15
+ class Envs:
16
+ def __init__(self):
17
+ self.hf_token = os.getenv("HF_TOKEN")
18
+ self.api_key = os.getenv("API_KEY")
19
+ self.end_task = os.getenv("END_TASK")
20
+ self.end_lang = os.getenv("END_LANG")
21
+ self.model_agent = os.getenv("MODEL_AGENT")
22
+ self.model_mllm = os.getenv("MODEL_MLLM")
23
+ self.model_loc = os.getenv("MODEL_LOC")
24
+
25
+ # Only initialize OpenAI client if we have the required env vars
26
+ if self.end_lang and self.api_key:
27
+ self.client = OpenAI(base_url=self.end_lang, api_key=self.api_key)
28
+ else:
29
+ self.client = None
30
+ print("WARNING: OpenAI client not initialized due to missing environment variables")
31
+
32
+ self.debug = os.getenv("DEBUG", "1").lower() in ("true", "1", "yes")
33
+ self.fps = int(os.getenv("FPS", "2"))  # fall back to 2 fps if FPS is unset so the app can still start (fallback value is an assumption)
34
+
35
+
36
+ env = Envs()
37
+
38
+ logger = logging.getLogger('copilot')
39
+ logger.setLevel(logging.DEBUG if env.debug else logging.INFO)
40
+ logger.addHandler(logging.StreamHandler())
app/memory.py ADDED
@@ -0,0 +1,278 @@
1
+ import asyncio
2
+ from dataclasses import dataclass, field
3
+ from agents import Runner, RunHooks
4
+ import threading
5
+ from typing import Any, Dict, Optional, List
6
+ import traceback
7
+ import time
8
+ from datetime import datetime
9
+ import numpy as np
10
+ import gradio as gr
11
+
12
+ from .config import logger, env
13
+ @dataclass
14
+ class RunnerStep:
15
+ """Log entry for a single Runner step"""
16
+ timestamp: str
17
+ step_type: str
18
+ agent_name: str
19
+ turn_number: int
20
+ details: Dict[str, Any] = field(default_factory=dict)
21
+ duration_ms: Optional[float] = None
22
+
23
+ def __str__(self) -> str:
24
+ return f"[{self.timestamp}][T{self.turn_number}][{self.step_type}]: {self.details}"
25
+
26
+ @dataclass
27
+ class Message:
28
+ role: str
29
+ content: str
30
+ mode: str
31
+ metadata: Dict[str, Any] = field(default_factory=dict)
32
+
33
+ @classmethod
34
+ def user(cls, content: str) -> "Message":
35
+ return cls("user", content, '')
36
+
37
+ @classmethod
38
+ def system(cls, content: str) -> "Message":
39
+ return cls("system", content, '')
40
+
41
+ @classmethod
42
+ def tool(cls, content: str, **kwargs) -> "Message":
43
+ return cls("assistant", content, 'tool', kwargs)
44
+
45
+ @classmethod
46
+ def assistant(cls, content: str, mode='') -> "Message":
47
+ return cls("assistant", content, mode)
48
+
49
+ @classmethod
50
+ def tts(cls, content: str) -> "Message":
51
+ return cls("assistant", content, 'tts')
52
+
53
+ def to_dict(self) -> Dict[str, Any]:
54
+ result = {"role": self.role, "content": self.content}
55
+ if self.mode == "tool":
56
+ metadata = self.metadata.copy()
57
+ if title := metadata.get("title"):
58
+ metadata["title"] = title.title()
59
+ result["metadata"] = metadata
60
+ return result
61
+
62
+
63
+ @dataclass
64
+ class Snapshot:
65
+ sender: str
66
+ data: Any
67
+
68
+ @property
69
+ def gr(self):
70
+ if isinstance(self.data, np.ndarray):
71
+ return gr.Image(self.data)
72
+ return self.data
73
+
74
+
75
+ class RunnerLoggerHooks(RunHooks):
76
+ """Custom hooks to log every step of the Runner"""
77
+
78
+ def __init__(self, memory_instance):
79
+ super().__init__()
80
+ self.memory = memory_instance
81
+ self.current_turn = 0
82
+ self.turn_start_time = None
83
+
84
+ async def on_agent_start(self, context, agent):
85
+ self.current_turn += 1
86
+ self.turn_start_time = time.time()
87
+
88
+ step = RunnerStep(
89
+ timestamp=datetime.now().isoformat(),
90
+ step_type="turn_start",
91
+ agent_name=agent.name,
92
+ turn_number=self.current_turn,
93
+ details={"message": f"Starting turn {self.current_turn} with agent {agent.name}"}
94
+ )
95
+ self.memory.log_runner_step(step)
96
+
97
+ async def on_agent_end(self, context, agent, result):
98
+ if self.turn_start_time:
99
+ duration = (time.time() - self.turn_start_time) * 1000
100
+ else:
101
+ duration = None
102
+
103
+ step = RunnerStep(
104
+ timestamp=datetime.now().isoformat(),
105
+ step_type="agent_call",
106
+ agent_name=agent.name,
107
+ turn_number=self.current_turn,
108
+ details={"message": f"Agent {agent.name} completed", "result_type": type(result).__name__},
109
+ duration_ms=duration
110
+ )
111
+ self.memory.log_runner_step(step)
112
+
113
+ async def on_tool_start(self, context, agent, tool_call):
114
+ tool_name = getattr(tool_call, 'name', 'unknown')
115
+ tool_args = None
116
+ for attr in ['arguments', 'args', 'function', 'parameters']:
117
+ if hasattr(tool_call, attr):
118
+ tool_args = getattr(tool_call, attr)
119
+ break
120
+ step = RunnerStep(
121
+ timestamp=datetime.now().isoformat(),
122
+ step_type="tool_call",
123
+ agent_name=agent.name,
124
+ turn_number=self.current_turn,
125
+ details={
126
+ "tool_name": tool_name,
127
+ "tool_args": tool_args,
128
+ "message": f"Calling tool {tool_name}"
129
+ }
130
+ )
131
+ self.memory.log_runner_step(step)
132
+
133
+ async def on_tool_end(self, context, agent, tool_call, result):
134
+ # Handle different tool_call object attributes safely
135
+ tool_name = getattr(tool_call, 'name', 'unknown')
136
+
137
+ step = RunnerStep(
138
+ timestamp=datetime.now().isoformat(),
139
+ step_type="tool_result",
140
+ agent_name=agent.name,
141
+ turn_number=self.current_turn,
142
+ details={
143
+ "tool_name": tool_name,
144
+ "result_length": len(str(result)) if result else 0,
145
+ "message": f"Tool {tool_name} completed"
146
+ }
147
+ )
148
+ self.memory.log_runner_step(step)
149
+
150
+
151
+ class Chat:
152
+ def __init__(self):
153
+ self.history = []
154
+
155
+ def append(self, message: Message):
156
+ self.history.append(message)
157
+
158
+ @property
159
+ def messages(self):
160
+ return [i.to_dict() for i in self.history]
161
+
162
+
163
+ class Memory:
164
+ def __init__(self, agent, limit: int = 200) -> None:
165
+ self.limit: int = limit
166
+ self.frames: list[Any] = []
167
+ self.snapshots: list[Any] = []
168
+ self.inputs: list[Any] = []
169
+ self.chat = Chat()
170
+
171
+ self.runner_steps: List[RunnerStep] = []
172
+ self.step_limit: int = 1000 # Keep last 1000 steps
173
+ self.logger_hooks: Optional[RunnerLoggerHooks] = None
174
+
175
+ self._chat_q: asyncio.Queue[Any] = asyncio.Queue()
176
+ self._input_q: asyncio.Queue[Any] = asyncio.Queue()
177
+ self._loop: Optional[asyncio.AbstractEventLoop] = None
178
+ self.is_waiting: bool = False
179
+ self.is_running: bool = False
180
+ self._last_frame_time: float = 0
181
+ self.setup(agent)
182
+
183
+ def log_runner_step(self, step: RunnerStep) -> None:
184
+ """Log a runner step and maintain the step history limit"""
185
+ self.runner_steps.append(step)
186
+ logger.debug(f"[ 🛠️ ]{step}")
187
+ while len(self.runner_steps) > self.step_limit:
188
+ self.runner_steps.pop(0)
189
+
190
+ def enqueue(self, data: Any) -> Optional[Any]:
191
+ current_time = time.time()
192
+ if current_time-self._last_frame_time > 1.0 / env.fps:
193
+ self._last_frame_time = current_time
194
+ self.frames.append(data)
195
+ while len(self.frames) > self.limit:
196
+ self.frames.pop(0)
197
+ return self.snapshots.pop(0) if self.snapshots else None
198
+
199
+ def receive(self, text: str) -> None:
200
+ self.chat.append(Message.user(text))
201
+ self._loop.call_soon_threadsafe(self._chat_q.put_nowait, text)
202
+
203
+
204
+ def setup(self, agent) -> None:
205
+ """Bind *agent* and spawn the background monitor threads."""
206
+ self.v_agent = agent
207
+ self.logger_hooks = RunnerLoggerHooks(self)
208
+ def _runner() -> None:
209
+ self._loop = asyncio.new_event_loop()
210
+ asyncio.set_event_loop(self._loop)
211
+ try:
212
+ self._loop.create_task(self._monitor_chat())
213
+ self._loop.run_forever()
214
+ finally:
215
+ self._loop.close()
216
+
217
+ threading.Thread(target=_runner, daemon=True).start()
218
+
219
+ async def _monitor_chat(self) -> None:
220
+ """Process incoming chat messages, respecting the waiting gate."""
221
+ while True:
222
+ text = await self._chat_q.get()
223
+ logger.debug(f"Processing: {text}")
224
+ start_step = RunnerStep(
225
+ timestamp=datetime.now().isoformat(),
226
+ step_type="processing_start",
227
+ agent_name=getattr(self.v_agent, 'name', 'unknown'),
228
+ turn_number=0,
229
+ details={"user_input": text}
230
+ )
231
+ self.log_runner_step(start_step)
232
+
233
+ try:
234
+ self.is_running = True
235
+ result = await Runner.run(
236
+ starting_agent=self.v_agent,
237
+ input=text,
238
+ context=self,
239
+ hooks=self.logger_hooks # Add our custom hooks here
240
+ )
241
+
242
+ self.is_running = False
243
+
244
+ # Log successful completion
245
+ success_step = RunnerStep(
246
+ timestamp=datetime.now().isoformat(),
247
+ step_type="final_output",
248
+ agent_name=getattr(self.v_agent, 'name', 'unknown'),
249
+ turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
250
+ details={
251
+ "output_type": type(result.final_output).__name__,
252
+ "output_preview": str(result.final_output)[:100] + "..." if len(str(result.final_output)) > 100 else str(result.final_output)
253
+ }
254
+ )
255
+ self.log_runner_step(success_step)
256
+
257
+ except Exception as exc: # noqa: BLE001
258
+ self.is_running = False
259
+ full_traceback = traceback.format_exc()
260
+ logger.debug(f"Error in _monitor_chat: {exc}\n{full_traceback}")
261
+
262
+ # Log the error
263
+ error_step = RunnerStep(
264
+ timestamp=datetime.now().isoformat(),
265
+ step_type="error",
266
+ agent_name=getattr(self.v_agent, 'name', 'unknown'),
267
+ turn_number=self.logger_hooks.current_turn if self.logger_hooks else 0,
268
+ details={
269
+ "error_type": type(exc).__name__,
270
+ "error_message": str(exc),
271
+ "traceback": full_traceback
272
+ }
273
+ )
274
+ self.log_runner_step(error_step)
275
+ continue
276
+ final = result.final_output.split('</think>', 1)[-1]
277
+ self.chat.append(Message.assistant(final))
278
+ await asyncio.sleep(0)
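Every `Runner` turn, tool call, and error is appended to `Memory.runner_steps` (capped at `step_limit`). A small illustrative helper for debugging, not included in this commit, could filter that log like so:

```python
def dump_tool_activity(mem: "Memory") -> None:
    """Print the tool-related RunnerStep entries, oldest first (illustrative helper)."""
    for step in mem.runner_steps:
        if step.step_type in ("tool_call", "tool_result", "error"):
            # RunnerStep.__str__ renders "[timestamp][T<turn>][<step_type>]: {details}"
            print(step)
```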
app/tool.py ADDED
@@ -0,0 +1,184 @@
1
+ import datetime
2
+ import json
3
+ import cv2
4
+ import httpx
5
+ from app.config import env
6
+ from app.utils import image_w_box, encode_image
7
+ from agents import RunContextWrapper, function_tool
8
+ from app.memory import Memory,Snapshot
9
+
10
+
11
+
12
+
13
+ def task(name, image):
14
+ resp = httpx.post(f"{env.end_task}",
15
+ data={"name": name},
16
+ files={"file": ("frame.jpg", image.tobytes(), "image/jpeg")},
17
+ timeout=10,
18
+ headers={"Authorization": env.api_key},
19
+ )
20
+ resp.raise_for_status()
21
+ return resp.json()['result']
22
+
23
+ def completion(messages, model):
24
+ response = env.client.chat.completions.create(
25
+ model=model,
26
+ messages=messages
27
+ )
28
+ return response.choices[0].message.content
29
+
30
+
31
+ def completion_image(images, prompt, model):
32
+ messages = [
33
+ {
34
+ "role": "user",
35
+ "content": [
36
+ {"type": "text", "text": prompt},
37
+ {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}},
38
+ ],
39
+ }
40
+ for b64, mime in map(encode_image, images)
41
+ ]
42
+ return completion(messages, model=model)
43
+
44
+ # ------------------------ Function Tools ------------------------
45
+ @function_tool
46
+ def caption(wrapper: RunContextWrapper[Memory]) -> str:
47
+ """
48
+ Generate a descriptive caption for the most recent frame, record it as a snapshot, and return it.
49
+ Returns:
50
+ str:
51
+ The generated caption for the current view (i.e., the latest frame).
52
+ """
53
+ mem = wrapper.context
54
+ prompt = "Describe the image with rich details but in a concise manner."
55
+ result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
56
+ mem.snapshots.append(Snapshot(sender='caption', data=result))
57
+ return result
58
+
59
+ @function_tool
60
+ def ocr(wrapper: RunContextWrapper[Memory]) -> str:
61
+ """
62
+ Perform OCR on the most recent frame, record it as a snapshot, and return the extracted text.
63
+ Returns:
64
+ str:
65
+ The extracted text from the current view (i.e., the latest frame).
66
+ """
67
+ mem = wrapper.context
68
+ prompt = "Extract all text from image/payslip without miss anything."
69
+ result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
70
+ mem.snapshots.append(Snapshot(sender='ocr', data=result))
71
+ return result
72
+
73
+ @function_tool
74
+ def qa(wrapper: RunContextWrapper[Memory], question: str) -> str:
75
+ """
76
+ Answer a question based on the most recent frame, record it as a snapshot, and return the answer.
77
+
78
+ Args:
79
+ question (str): The question to be answered.
80
+ Returns:
81
+ str:
82
+ The answer to the question based on the current view (i.e., the latest frame).
83
+ """
84
+ mem = wrapper.context
85
+ prompt = f"Answer the question based on the image. Question: {question}"
86
+ result = completion_image([mem.frames[-1]], prompt, env.model_mllm)
87
+ mem.snapshots.append(Snapshot(sender='qa', data=result))
88
+ return result
89
+
90
+
91
+ @function_tool
92
+ def localize(wrapper: RunContextWrapper[Memory]) -> str:
93
+ """
94
+ Localize all objects in the most recent frame
95
+ Returns:
96
+ str:
97
+ The localization result for the current view (i.e., the latest frame).
98
+ the format is {name:list of bboxes}
99
+ """
100
+ mem = wrapper.context
101
+ frame = mem.frames[-1]
102
+ _, img = cv2.imencode('.jpg', frame)
103
+ objxbox = task(env.model_loc, img)
104
+ mem.snapshots.append(Snapshot(sender='localize', data=image_w_box(frame, objxbox)))
105
+ return json.dumps(objxbox, indent=2)
106
+
107
+
108
+ @function_tool
109
+ def time(wrapper: RunContextWrapper[Memory]) -> str:
110
+ """
111
+ Get the current time, record it as a snapshot, and return the time.
112
+ Returns:
113
+ str:
114
+ The current time.
115
+ """
116
+ mem = wrapper.context
117
+ result = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
118
+ mem.snapshots.append(Snapshot(sender='time', data=result))
119
+ return result
120
+
121
+ def sample_frames(mem: Memory, n: int) -> list:
122
+ """
123
+ Sample frames from the past n seconds of video.
124
+
125
+ Args:
126
+ mem (Memory): The memory context containing frames.
127
+ n (int): Number of seconds to look back for video frames.
128
+ Returns:
129
+ list: Sampled frames from the video sequence.
130
+ """
131
+ if len(mem.frames) == 0:
132
+ return []
133
+
134
+ available_frames = min(n * env.fps, len(mem.frames))
135
+ recent_frames = mem.frames[-available_frames:]
136
+ sampled_frames = recent_frames[::env.fps // 2]
137
+
138
+ return sampled_frames
139
+
140
+ @function_tool
141
+ def video_caption(wrapper: RunContextWrapper[Memory], n=2) -> str:
142
+ """
143
+ Generate a descriptive caption for a video sequence from the past n seconds of frames.
144
+ The n is a required parameter that specifies how many seconds of video frames to consider.
145
+
146
+ Args:
147
+ n (int): Number of seconds to look back for video frames.
148
+ Returns:
149
+ str:
150
+ The generated caption for the video sequence from the past n seconds.
151
+ """
152
+ mem = wrapper.context
153
+ sampled_frames = sample_frames(mem, n)
154
+
155
+ if len(sampled_frames) == 0:
156
+ return "No frames available for video caption."
157
+
158
+ prompt = "Describe this video sequence focusing on any changes or actions that occur over time."
159
+ result = completion_image(sampled_frames, prompt, env.model_mllm)
160
+ mem.snapshots.append(Snapshot(sender='video caption', data=result))
161
+ return result
162
+
163
+ @function_tool
164
+ def video_qa(wrapper: RunContextWrapper[Memory], question: str, n=2) -> str:
165
+ """
166
+ Answer a question based on a video sequence from the past n seconds of frames.
167
+
168
+ Args:
169
+ question (str): The question to be answered.
170
+ n (int): Number of seconds to look back for video frames.
171
+ Returns:
172
+ str:
173
+ The answer to the question based on the video sequence from the past n seconds.
174
+ """
175
+ mem = wrapper.context
176
+ sampled_frames = sample_frames(mem, n)
177
+
178
+ if len(sampled_frames) == 0:
179
+ return "No frames available for video Q&A."
180
+
181
+ prompt = f"Answer the question based on this video sequence. Question: {question}"
182
+ result = completion_image(sampled_frames, prompt, env.model_mllm)
183
+ mem.snapshots.append(Snapshot(sender='video qa', data=result))
184
+ return result
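All tools above follow the same pattern: a `@function_tool` that reads recent frames from the `Memory` context, calls a model or local routine, and records a `Snapshot` so the result shows up in the Gradio chat. A hypothetical new tool (not part of this commit) would look like the sketch below; the blur metric is just an example.

```python
import cv2
from agents import RunContextWrapper, function_tool
from app.memory import Memory, Snapshot

@function_tool
def sharpness(wrapper: RunContextWrapper[Memory]) -> str:
    """
    Estimate how sharp (in focus) the most recent frame is, record it as a snapshot, and return it.
    Returns:
        str: A sharpness score for the current view; higher means sharper.
    """
    mem = wrapper.context
    gray = cv2.cvtColor(mem.frames[-1], cv2.COLOR_RGB2GRAY)
    score = cv2.Laplacian(gray, cv2.CV_64F).var()   # variance of the Laplacian as a blur measure
    result = f"Sharpness score: {score:.1f} (higher is sharper)"
    mem.snapshots.append(Snapshot(sender='sharpness', data=result))
    return result
```

Registering such a tool would be a one-line change: add it to the `tools=[...]` list in `build_agent()` (`app/agent.py`).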
app/utils.py ADDED
@@ -0,0 +1,72 @@
1
+ from PIL import Image
2
+ import base64
3
+ import supervision as sv
4
+ import numpy as np
5
+ import cv2
6
+ colors = sv.ColorPalette.from_hex(
7
+ [
8
+ "#a1c9f4",
9
+ "#ffb482",
10
+ "#8de5a1",
11
+ "#ff9f9b",
12
+ "#d0bbff",
13
+ "#debb9b",
14
+ "#fab0e4",
15
+ "#cfcfcf",
16
+ "#fffea3",
17
+ "#b9f2f0",
18
+ "#a1c9f4",
19
+ "#ffb482",
20
+ "#8de5a1",
21
+ "#ff9f9b",
22
+ "#d0bbff",
23
+ "#debb9b",
24
+ "#fab0e4",
25
+ "#cfcfcf",
26
+ "#fffea3",
27
+ "#b9f2f0",
28
+ ]
29
+ )
30
+
31
+ def image_w_box(image,objxbox):
32
+
33
+ box_annotator = sv.BoxCornerAnnotator(thickness=10, corner_length=30, color=colors)
34
+ label_annotator = sv.LabelAnnotator(color=colors)
35
+ mask_annotator = sv.MaskAnnotator(opacity=0.2, color=colors)
36
+
37
+ xyxys = np.array([v for boxes in objxbox.values() for v in boxes])
38
+ unique_labels = sorted(objxbox.keys())
39
+ class_id_map = dict(enumerate(unique_labels))
40
+ labels = [l for l, boxes in objxbox.items() for _ in boxes]
41
+ class_id = [list(class_id_map.values()).index(label) for label in labels]
42
+
43
+ masks = np.zeros((len(xyxys), image.shape[0], image.shape[1]), dtype=bool)
44
+ for i, (x1, y1, x2, y2) in enumerate(xyxys):
45
+ masks[i, int(y1):int(y2), int(x1):int(x2)] = True  # mark the box region in the boolean mask
46
+
47
+ if len(xyxys) == 0:
48
+ return image
49
+ detections = sv.Detections(
50
+ xyxy=xyxys,
51
+ mask=masks,
52
+ class_id=np.array(class_id),
53
+ )
54
+ # Convert RGB to BGR for annotation
55
+ image_bgr = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
56
+ # After annotation, convert back to RGB
57
+ annotated_image = box_annotator.annotate(scene=image_bgr.copy(), detections=detections)
58
+ annotated_image = label_annotator.annotate(scene=annotated_image, detections=detections, labels=labels)
59
+ annotated_image = mask_annotator.annotate(scene=annotated_image, detections=detections)
60
+
61
+ return cv2.cvtColor(annotated_image, cv2.COLOR_BGR2RGB)
62
+
63
+
64
+ def encode_image(img) -> tuple[str, str]:
65
+ arr = np.array(img.convert("RGB")) if isinstance(img, Image.Image) else img
66
+ if not isinstance(arr, np.ndarray):
67
+ raise ValueError("Unsupported image type")
68
+ ok, buf = cv2.imencode('.jpg', cv2.cvtColor(arr, cv2.COLOR_RGB2BGR))
69
+ if not ok:
70
+ raise ValueError("Encoding failed")
71
+ b64 = base64.b64encode(buf).decode('utf-8')
72
+ return b64, "image/jpeg"
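For reference, a quick sketch of how these helpers are used elsewhere in the commit: `localize` in `app/tool.py` passes the detector's `{label: [boxes]}` output to `image_w_box`, and `completion_image` base64-encodes frames with `encode_image`. The dummy frame and boxes here are illustrative.

```python
import numpy as np
from app.utils import image_w_box, encode_image

frame = np.zeros((480, 640, 3), dtype=np.uint8)                           # dummy RGB frame
objxbox = {"cup": [[100, 120, 220, 260]], "book": [[300, 80, 560, 400]]}  # {label: [x1, y1, x2, y2]}

annotated = image_w_box(frame, objxbox)    # RGB frame with corner boxes, labels, and translucent masks
b64, mime = encode_image(annotated)        # base64-encoded JPEG + MIME type for the multimodal endpoint
```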
main.py ADDED
@@ -0,0 +1,183 @@
1
+ from pathlib import Path
2
+ import os
3
+ import cv2
4
+ import gradio as gr
5
+ from fastrtc import Stream,WebRTC
6
+ from app.config import env
7
+ from fastrtc import AdditionalOutputs
8
+ from app.memory import Memory,Message
9
+ from fastrtc import get_cloudflare_turn_credentials
10
+ from app.agent import build_agent
11
+ from fastrtc import get_current_context
12
+ session_memories = {}
13
+
14
+ def get_session_memory(session_id: str = None) -> Memory:
15
+ if session_id not in session_memories:
16
+ session_memories[session_id] = Memory(build_agent())
17
+ welcome_message = "👋 Now I can see. Feel free to ask me about anything!"
18
+ session_memories[session_id].chat.append(Message.assistant(welcome_message))
19
+ return session_memories[session_id]
20
+
21
+ def video_handler(frame):
22
+ frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
23
+ rtcid = get_current_context().webrtc_id
24
+ mem = get_session_memory(rtcid)
25
+ if (snapshot := mem.enqueue(frame)):
26
+ mem.chat.append(Message.tool(snapshot.gr, title=snapshot.sender, status='done'))
27
+ return frame, AdditionalOutputs(mem.chat.messages, rtcid)
28
+
29
+ def chat_handler(text, webrtc_state):
30
+ if webrtc_state is None:
31
+ return "", [{"role": "assistant", "content": "Please start your camera first to begin the conversation."}], webrtc_state
32
+
33
+ mem = get_session_memory(webrtc_state)
34
+ if not mem.is_running:
35
+ mem.receive(text.strip())
36
+ return "", mem.chat.messages, webrtc_state
37
+
38
+
39
+
40
+
41
+
42
+ if __name__ == "__main__":
43
+ print("🚀 Starting Perceptual Copilot...")
44
+ print(f"HF Spaces: {os.getenv('SPACE_ID') is not None}")
45
+ print(f"Environment check - API_KEY: {'✓' if env.api_key else '✗'}")
46
+ print(f"Environment check - END_LANG: {'✓' if env.end_lang else '✗'}")
47
+ print(f"Environment check - OpenAI Client: {'✓' if env.client else '✗'}")
48
+
49
+
50
+
51
+ with gr.Blocks(
52
+ title="🤖 Perceptual Copilot - AI Vision Assistant",
53
+ theme=gr.themes.Soft(
54
+ primary_hue="blue",
55
+ secondary_hue="orange",
56
+ neutral_hue="slate",
57
+ font=("system-ui", "sans-serif")
58
+ ),
59
+ css=Path("styles.css").read_text(),
60
+ ) as demo:
61
+
62
+ # Header section with sleek styling
63
+ gr.Markdown("""
64
+ <div class="ultra-sleek-header">
65
+ <h1 class="hero-title">
66
+ <span class="title-primary">Perceptual</span>
67
+ <span class="title-accent">Copilot</span>
68
+ </h1>
69
+ <p class="hero-subtitle">
70
+ <span class="status-dot"></span>
71
+ An experimental prototype that integrates OpenAI agents with visual tools to process real-time video streams.
72
+ </p>
73
+ <div class="feature-pills">
74
+ <span class="pill">Real-time streaming</span>
75
+ <span class="pill">Visual Agent</span>
76
+ <span class="pill">Large vision language model</span>
77
+ <span class="pill">Reasoning</span>
78
+ </div>
79
+ </div>
80
+ """, elem_classes="ultra-sleek-header")
81
+
82
+ state = gr.State(value=None)
83
+
84
+ # Main interface with improved layout
85
+ with gr.Row(equal_height=True):
86
+ with gr.Column(scale=1, elem_classes="video-container"):
87
+ video = WebRTC(
88
+ label="🎥 Camera Stream",
89
+ rtc_configuration=get_cloudflare_turn_credentials(hf_token=env.hf_token),
90
+ track_constraints={
91
+ "width": {"exact": 600},
92
+ "height": {"exact": 600},
93
+ "aspectRatio": {"exact": 1}},
94
+ mode="send",
95
+ modality="video",
96
+ mirror_webcam=True,
97
+ width=600,
98
+ height=600,
99
+ )
100
+
101
+ with gr.Column(scale=1, elem_classes="chat-container"):
102
+ gr.Markdown("### 💬 Chat")
103
+ chatbot = gr.Chatbot(
104
+ type="messages",
105
+ height=450,
106
+ label="🤖 AI Assistant",
107
+ placeholder="Chat history will appear here...",
108
+ show_label=False,
109
+ )
110
+
111
+ with gr.Row(elem_classes="items-center"):
112
+ textbox = gr.Textbox(
113
+ placeholder="💭 Question goes here, press ENTER to send",
114
+ lines=1,
115
+ show_label=False,
116
+ )
117
+ # Event handlers
118
+ video.stream(
119
+ fn=video_handler,
120
+ inputs=[video],
121
+ outputs=[video],
122
+ concurrency_limit=10,
123
+ )
124
+ video.on_additional_outputs(
125
+ fn=lambda messages, webrtc_id: (messages, webrtc_id),
126
+ outputs=[chatbot, state]
127
+ )
128
+
129
+ # Chat handler for textbox
130
+ textbox.submit(
131
+ chat_handler,
132
+ inputs=[textbox, state],
133
+ outputs=[textbox, chatbot, state]
134
+ )
135
+
136
+ # Enhanced instructions section
137
+ with gr.Column(elem_classes="instructions-container"):
138
+ gr.Markdown("""
139
+ ## 🚀 Get Started
140
+
141
+ **📌 Quick Reminder:**
142
+ 1. Allow camera access when prompted
143
+ 2. Wait for the camera to initialize and first message to appear
144
+ 3. 💡 **Tip:** If you find it hard to see the interface, please turn off night mode for better visibility
145
+ """)
146
+
147
+ with gr.Row():
148
+ with gr.Column():
149
+ gr.Markdown("""
150
+ ### 💡 Example Prompts
151
+
152
+ **🌍 General Vision:**
153
+ - *"What do you see in front of me?"*
154
+ - *"What's the overall environment like?"*
155
+
156
+ **📄 Text & Documents:**
157
+ - *"Read the text in this document"*
158
+ - *"Extract the code snippet from this image"*
159
+
160
+ **🔍 Object Recognition:**
161
+ - *"What objects are visible?"*
162
+ - *"Help me identify this item"*
163
+ """)
164
+
165
+ with gr.Column():
166
+ gr.Markdown("""
167
+ ### 🔧 Current Capabilities
168
+
169
+ **🚀 Available Features:**
170
+ - **OCR** - Text extraction and reading
171
+ - **Q&A** - Visual question answering
172
+ - **Caption** - Scene description and analysis
173
+ - **Localization** - Object detection and positioning
174
+ - **Time** - Current time and temporal context
175
+
176
+ **📈 More Coming Soon:**
177
+ We're continuously adding new capabilities to enhance your visual AI experience.
178
+
179
+ **⚠️ Important Note:**
180
+ All models are self-hosted. Please avoid abuse of the system.
181
+ """)
182
+ demo.queue(default_concurrency_limit=None)
183
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,7 @@
1
+ huggingface_hub
2
+ python-dotenv
3
+ supervision
4
+ openai-agents
5
+ fastrtc==0.0.21
6
+ gradio
7
+ pydantic==2.10.6
styles.css ADDED
@@ -0,0 +1,391 @@
1
+ .gradio-container {
2
+ background: linear-gradient(135deg, #fefefe 0%, #f8f6f0 100%) !important;
3
+ font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
4
+ }
5
+
6
+ .main-header {
7
+ background: rgba(255, 255, 255, 0.95);
8
+ border-radius: 15px;
9
+ padding: 20px;
10
+ margin: 10px 0;
11
+ box-shadow: 0 8px 32px rgba(0, 0, 0, 0.1);
12
+ backdrop-filter: blur(10px);
13
+ border: 1px solid rgba(255, 255, 255, 0.2);
14
+ }
15
+
16
+ .video-container {
17
+ background: rgba(255, 255, 255, 0.9);
18
+ border-radius: 12px;
19
+ padding: 20px;
20
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
21
+ backdrop-filter: blur(8px);
22
+ border: 1px solid rgba(255, 255, 255, 0.3);
23
+ }
24
+
25
+ .chat-container {
26
+ background: rgba(255, 255, 255, 0.9);
27
+ border-radius: 12px;
28
+ padding: 20px;
29
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
30
+ backdrop-filter: blur(8px);
31
+ border: 1px solid rgba(255, 255, 255, 0.3);
32
+ }
33
+
34
+ .chat-container textarea {
35
+ border: 3px solid #4285f4 !important;
36
+ border-radius: 12px !important;
37
+ padding: 15px !important;
38
+ font-size: 16px !important;
39
+ font-weight: 500 !important;
40
+ background: #ffffff !important;
41
+ box-shadow: 0 4px 20px rgba(66, 133, 244, 0.15) !important;
42
+ transition: all 0.3s ease !important;
43
+ min-height: 50px !important;
44
+ display: flex !important;
45
+ align-items: center !important;
46
+ justify-content: center !important;
47
+ resize: none !important;
48
+ }
49
+
50
+ .chat-container textarea:focus {
51
+ border-color: #1a73e8 !important;
52
+ box-shadow: 0 6px 25px rgba(66, 133, 244, 0.3), 0 0 0 3px rgba(66, 133, 244, 0.1) !important;
53
+ outline: none !important;
54
+ transform: translateY(-2px) !important;
55
+ }
56
+
57
+ .chat-container textarea::placeholder {
58
+ color: #5f6368 !important;
59
+ font-weight: 400 !important;
60
+ font-size: 15px !important;
61
+ }
62
+
63
+ .chat-container .gr-text-input {
64
+ border: none !important;
65
+ background: transparent !important;
66
+ }
67
+
68
+ .instructions-container {
69
+ background: rgba(255, 255, 255, 0.9);
70
+ border-radius: 12px;
71
+ padding: 25px;
72
+ margin: 20px 0;
73
+ box-shadow: 0 2px 12px rgba(0, 0, 0, 0.04);
74
+ backdrop-filter: blur(8px);
75
+ border: 1px solid rgba(255, 255, 255, 0.3);
76
+ }
77
+
78
+ .feature-card {
79
+ background: linear-gradient(135deg, #ff6b6b, #feca57);
80
+ border-radius: 10px;
81
+ padding: 15px;
82
+ margin: 10px 0;
83
+ color: white;
84
+ box-shadow: 0 4px 15px rgba(0, 0, 0, 0.2);
85
+ }
86
+
87
+ .status-indicator {
88
+ display: inline-block;
89
+ width: 10px;
90
+ height: 10px;
91
+ background: #4CAF50;
92
+ border-radius: 50%;
93
+ margin-right: 8px;
94
+ animation: pulse 2s infinite;
95
+ }
96
+
97
+ @keyframes pulse {
98
+ 0% { opacity: 1; }
99
+ 50% { opacity: 0.5; }
100
+ 100% { opacity: 1; }
101
+ }
102
+
103
+ .title-emoji {
104
+ font-size: 2em;
105
+ margin-right: 10px;
106
+ vertical-align: middle;
107
+ }
108
+
109
+ /* Hide Gradio branding and footer elements */
110
+ .gradio-container footer {
111
+ display: none !important;
112
+ }
113
+
114
+ /* Hide "Use via API" button */
115
+ .gradio-container .api-docs {
116
+ display: none !important;
117
+ }
118
+
119
+ /* Hide "Built with Gradio" logo */
120
+ .gradio-container .built-with {
121
+ display: none !important;
122
+ }
123
+
124
+ /* Hide Settings button */
125
+ .gradio-container .settings {
126
+ display: none !important;
127
+ }
128
+
129
+ /* Hide the entire footer area */
130
+ .gradio-container .footer {
131
+ display: none !important;
132
+ }
133
+
134
+ /* Alternative selectors for different Gradio versions */
135
+ .gradio-container div[data-testid="footer"] {
136
+ display: none !important;
137
+ }
138
+
139
+ .gradio-container .gradio-footer {
140
+ display: none !important;
141
+ }
142
+
143
+ /* Hide any elements containing gradio branding text */
144
+ .gradio-container a[href*="gradio"] {
145
+ display: none !important;
146
+ }
147
+
148
+ .gradio-container div:has-text("Built with") {
149
+ display: none !important;
150
+ }
151
+
152
+ .gradio-container div:has-text("Use via API") {
153
+ display: none !important;
154
+ }
155
+
156
+ /* Ultra-sleek header styling with modern design */
157
+ .ultra-sleek-header {
158
+ text-align: center;
159
+ padding: 20px 20px 5px 20px;
160
+ margin: 0;
161
+ background: transparent;
162
+ border: none;
163
+ position: relative;
164
+ overflow: hidden;
165
+ }
166
+
167
+ .ultra-sleek-header::before {
168
+ content: '';
169
+ position: absolute;
170
+ top: 0;
171
+ left: 0;
172
+ right: 0;
173
+ bottom: 0;
174
+ background: radial-gradient(ellipse at center top, rgba(99, 102, 241, 0.05) 0%, transparent 70%);
175
+ pointer-events: none;
176
+ }
177
+
178
+
179
+ .badge-icon {
180
+ font-size: 1rem;
181
+ animation: float 3s ease-in-out infinite;
182
+ }
183
+
184
+ @keyframes float {
185
+
186
+ 0%,
187
+ 100% {
188
+ transform: translateY(0px);
189
+ }
190
+
191
+ 50% {
192
+ transform: translateY(-4px);
193
+ }
194
+ }
195
+
196
+ .hero-title {
197
+ font-size: 4rem;
198
+ font-weight: 800;
199
+ margin: 10px 0 5px 0;
200
+ line-height: 1.1;
201
+ letter-spacing: -0.03em;
202
+ position: relative;
203
+ z-index: 1;
204
+ }
205
+
206
+ .title-primary {
207
+ background: linear-gradient(45deg,
208
+ #4169E1 0%,
209
+ #8A2BE2 50%,
210
+ #E91E63 100%);
211
+ -webkit-background-clip: text;
212
+ -webkit-text-fill-color: transparent;
213
+ background-clip: text;
214
+ position: relative;
215
+ }
216
+ .title-primary::after {
217
+ content: '';
218
+ position: absolute;
219
+ bottom: -8px;
220
+ left: 0;
221
+ right: 0;
222
+ height: 4px;
223
+ background: linear-gradient(90deg, #6366f1, #8b5cf6, #d946ef);
224
+ border-radius: 2px;
225
+ opacity: 0.6;
226
+ }
227
+
228
+ .title-accent {
229
+ background: linear-gradient(135deg, #1e293b 0%, #0d1e35 100%);
230
+ -webkit-background-clip: text;
231
+ -webkit-text-fill-color: transparent;
232
+ background-clip: text;
233
+ position: relative;
234
+ margin-left: 12px;
235
+ }
236
+
237
+
238
+
239
+ .hero-subtitle {
240
+ font-size: 1.25rem;
241
+ color: #64748b;
242
+ font-weight: 400;
243
+ margin: 15px auto 20px auto;
244
+ line-height: 1.6;
245
+ max-width: 580px;
246
+ display: flex;
247
+ align-items: center;
248
+ justify-content: center;
249
+ gap: 12px;
250
+ }
251
+
252
+ .status-dot {
253
+ display: inline-block;
254
+ width: 8px;
255
+ height: 8px;
256
+ background: #10b981;
257
+ border-radius: 50%;
258
+ position: relative;
259
+ flex-shrink: 0;
260
+ }
261
+
262
+ .status-dot::before {
263
+ content: '';
264
+ position: absolute;
265
+ top: 0;
266
+ left: 0;
267
+ width: 8px;
268
+ height: 8px;
269
+ background: #10b981;
270
+ border-radius: 50%;
271
+ animation: ping 2s cubic-bezier(0, 0, 0.2, 1) infinite;
272
+ transform-origin: center;
273
+ }
274
+
275
+ @keyframes ping {
276
+
277
+ 75%,
278
+ 100% {
279
+ transform: scale(2);
280
+ opacity: 0;
281
+ }
282
+ }
283
+
284
+ .feature-pills {
285
+ display: flex;
286
+ justify-content: center;
287
+ gap: 12px;
288
+ margin-top: 20px;
289
+ flex-wrap: wrap;
290
+ }
291
+
292
+ .pill {
293
+ background: linear-gradient(135deg, rgba(255, 255, 255, 0.95), rgba(248, 250, 252, 0.9));
294
+ border: 1px solid rgba(148, 163, 184, 0.15);
295
+ border-radius: 25px;
296
+ padding: 6px 14px;
297
+ font-size: 0.8rem;
298
+ font-weight: 600;
299
+ color: #475569;
300
+ backdrop-filter: blur(12px);
301
+ transition: all 0.25s cubic-bezier(0.4, 0, 0.2, 1);
302
+ white-space: nowrap;
303
+ box-shadow: 0 1px 3px rgba(0, 0, 0, 0.05);
304
+ letter-spacing: 0.025em;
305
+ }
306
+
307
+ .pill:hover {
308
+ background: linear-gradient(135deg, rgba(99, 102, 241, 0.1), rgba(139, 92, 246, 0.05));
309
+ border-color: rgba(99, 102, 241, 0.25);
310
+ color: #6366f1;
311
+ transform: translateY(-1px) scale(1.02);
312
+ box-shadow: 0 4px 12px rgba(99, 102, 241, 0.15), 0 2px 4px rgba(0, 0, 0, 0.05);
313
+ }
314
+
315
+ /* Ensure proper vertical alignment for chat input row */
316
+ .items-center {
317
+ display: flex !important;
318
+ align-items: center !important;
319
+ gap: 8px !important;
320
+ }
321
+
322
+ /* Responsive design for mobile */
323
+ @media (max-width: 768px) {
324
+ .ultra-sleek-header {
325
+ padding: 15px 15px 20px 15px;
326
+ }
327
+
328
+ .hero-title {
329
+ font-size: 3rem;
330
+ }
331
+
332
+ .hero-subtitle {
333
+ font-size: 1.125rem;
334
+ padding: 0 15px;
335
+ flex-direction: column;
336
+ gap: 8px;
337
+ }
338
+
339
+ .feature-pills {
340
+ gap: 8px;
341
+ margin-top: 24px;
342
+ }
343
+
344
+ .pill {
345
+ font-size: 0.8rem;
346
+ padding: 6px 12px;
347
+ }
348
+ }
349
+
350
+ @media (max-width: 480px) {
351
+ .hero-title {
352
+ font-size: 2.5rem;
353
+ line-height: 1.2;
354
+ }
355
+
356
+ .title-accent {
357
+ margin-left: 8px;
358
+ }
359
+ }
360
+ .gradio-container .chatbot .message-wrap button[aria-label*="clear" i],
361
+ .gradio-container .chatbot .message-wrap button[title*="clear" i] {
362
+ display: none !important;
363
+ }
364
+
365
+ /* Target any button with clear-related text content */
366
+ button:contains("Clear"),
367
+ button:contains("clear"),
368
+ button:contains("CLEAR") {
369
+ display: none !important;
370
+ }
371
+
372
+ /* Target buttons in chat interfaces specifically */
373
+ .chat-container button[aria-label*="clear" i],
374
+ .chatbot button[aria-label*="clear" i],
375
+ .message-wrap button[aria-label*="clear" i] {
376
+ display: none !important;
377
+ }
378
+
379
+ /* Hide buttons with specific SVG icons that might represent clear/delete */
380
+ button svg[data-testid*="clear"],
381
+ button svg[data-testid*="delete"],
382
+ button svg[data-testid*="trash"] {
383
+ display: none !important;
384
+ }
385
+
386
+ /* Hide parent button if it contains clear-related SVG */
387
+ button:has(svg[data-testid*="clear"]),
388
+ button:has(svg[data-testid*="delete"]),
389
+ button:has(svg[data-testid*="trash"]) {
390
+ display: none !important;
391
+ }