import os
import time
import base64
import aiohttp
import asyncio
import logging
from typing import Optional, Dict, Any, Union
from PIL import Image
from agentpress.tool import Tool, ToolResult, openapi_schema, xml_schema
from sandbox.sandbox import SandboxToolsBase, Sandbox
# Key names offered as the `enum` of the `press` and `hotkey` tool schemas.
KEYBOARD_KEYS = [
    # Letters and digits, one entry per character
    *"abcdefghijklmnopqrstuvwxyz",
    *"0123456789",
    # Editing and whitespace keys
    'enter', 'esc', 'backspace', 'tab', 'space', 'delete',
    # Modifier keys
    'ctrl', 'alt', 'shift', 'win',
    # Arrow keys
    'up', 'down', 'left', 'right',
    # Function keys f1 .. f12
    *(f"f{number}" for number in range(1, 13)),
    # Common combinations, written as '+'-joined key names
    'ctrl+c', 'ctrl+v', 'ctrl+x', 'ctrl+z', 'ctrl+a', 'ctrl+s',
    'alt+tab', 'alt+f4', 'ctrl+alt+delete',
]
class ComputerUseTool(SandboxToolsBase):
    """Computer automation tool for controlling the sandbox browser and GUI.

    Every action is forwarded as an HTTP request to the automation service
    reached through the sandbox preview link for port 8000. The tool keeps
    the last cursor position that the service accepted so that later actions
    (click, scroll, mouse down/up) can default to it.
    """

    def __init__(self, sandbox: Sandbox):
        """Initialize automation tool with sandbox connection."""
        super().__init__(sandbox)
        self.session: Optional[aiohttp.ClientSession] = None  # created lazily
        self.mouse_x = 0  # Track current mouse position
        self.mouse_y = 0
        # Get automation service URL using port 8000
        self.api_base_url = self.sandbox.get_preview_link(8000)
        logging.info("Initialized Computer Use Tool with API URL: %s", self.api_base_url)

    async def _get_session(self) -> aiohttp.ClientSession:
        """Get or create aiohttp session for API requests."""
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession()
        return self.session

    async def _api_request(self, method: str, endpoint: str, data: Optional[Dict] = None) -> Dict:
        """Send request to automation service API.

        Args:
            method: HTTP method. "GET" (any case) issues a GET without a body;
                every other value is sent as a POST with ``data`` as JSON.
            endpoint: Path appended to ``<api_base_url>/api``.
            data: Optional JSON payload for POST requests.

        Returns:
            The decoded JSON object. On any exception, on a non-object JSON
            body, or on an HTTP error status without an explicit error, a
            dict with ``success`` False and an ``error`` description.
        """
        try:
            session = await self._get_session()
            url = f"{self.api_base_url}/api{endpoint}"
            logging.debug("API request: %s %s %s", method, url, data)

            if method.upper() == "GET":
                request = session.get(url)
            else:  # POST
                request = session.post(url, json=data)

            async with request as response:
                status = response.status
                result = await response.json()

            if not isinstance(result, dict):
                # Callers rely on dict access; never hand them a list/scalar.
                result = {"success": False, "error": f"Unexpected API response: {result!r}"}
            elif status >= 400:
                # Keep whatever the service reported, but make sure an HTTP
                # failure is not surfaced as "Unknown error".
                result.setdefault("success", False)
                result.setdefault("error", f"HTTP {status}")

            logging.debug("API response: %s", result)
            return result
        except Exception as e:
            logging.error("API request failed: %s", e)
            return {"success": False, "error": str(e)}

    async def cleanup(self):
        """Clean up resources."""
        if self.session and not self.session.closed:
            await self.session.close()
            self.session = None

    @staticmethod
    def _coerce_coordinate(value) -> int:
        """Convert a numeric (or numeric string) coordinate to the nearest int."""
        return int(round(float(value)))

    def _resolve_position(self, x: Optional[float], y: Optional[float]):
        """Return ``(x, y)`` as ints, using the tracked cursor position for None."""
        x_val = self.mouse_x if x is None else x
        y_val = self.mouse_y if y is None else y
        return self._coerce_coordinate(x_val), self._coerce_coordinate(y_val)

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "move_to",
            "description": "Move cursor to specified position",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {
                        "type": "number",
                        "description": "X coordinate"
                    },
                    "y": {
                        "type": "number",
                        "description": "Y coordinate"
                    }
                },
                "required": ["x", "y"]
            }
        }
    })
    @xml_schema(
        tag_name="move-to",
        mappings=[
            {"param_name": "x", "node_type": "attribute", "path": "."},
            {"param_name": "y", "node_type": "attribute", "path": "."}
        ],
        example='''
        <move-to x="100" y="200">
        </move-to>
        '''
    )
    async def move_to(self, x: float, y: float) -> ToolResult:
        """Move cursor to specified position.

        Coordinates are rounded to integers; the tracked position is updated
        only when the service reports success.
        """
        try:
            x_int = self._coerce_coordinate(x)
            y_int = self._coerce_coordinate(y)
            result = await self._api_request("POST", "/automation/mouse/move", {
                "x": x_int,
                "y": y_int
            })
            if result.get("success", False):
                self.mouse_x = x_int
                self.mouse_y = y_int
                return ToolResult(success=True, output=f"Moved to ({x_int}, {y_int})")
            return ToolResult(success=False, output=f"Failed to move: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to move: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "click",
            "description": "Click at current or specified position",
            "parameters": {
                "type": "object",
                "properties": {
                    "button": {
                        "type": "string",
                        "description": "Mouse button to click",
                        "enum": ["left", "right", "middle"],
                        "default": "left"
                    },
                    "x": {
                        "type": "number",
                        "description": "Optional X coordinate"
                    },
                    "y": {
                        "type": "number",
                        "description": "Optional Y coordinate"
                    },
                    "num_clicks": {
                        "type": "integer",
                        "description": "Number of clicks",
                        "enum": [1, 2, 3],
                        "default": 1
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="click",
        mappings=[
            {"param_name": "x", "node_type": "attribute", "path": "x"},
            {"param_name": "y", "node_type": "attribute", "path": "y"},
            {"param_name": "button", "node_type": "attribute", "path": "button"},
            {"param_name": "num_clicks", "node_type": "attribute", "path": "num_clicks"}
        ],
        example='''
        <click x="100" y="200" button="left" num_clicks="1">
        </click>
        '''
    )
    async def click(self, x: Optional[float] = None, y: Optional[float] = None,
                    button: str = "left", num_clicks: int = 1) -> ToolResult:
        """Click at current or specified position.

        A missing coordinate falls back to the tracked cursor position.
        """
        try:
            x_int, y_int = self._resolve_position(x, y)
            num_clicks = int(num_clicks)
            result = await self._api_request("POST", "/automation/mouse/click", {
                "x": x_int,
                "y": y_int,
                "clicks": num_clicks,
                "button": str(button).lower()
            })
            if result.get("success", False):
                self.mouse_x = x_int
                self.mouse_y = y_int
                return ToolResult(success=True,
                                  output=f"{num_clicks} {button} click(s) performed at ({x_int}, {y_int})")
            return ToolResult(success=False, output=f"Failed to click: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to click: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "scroll",
            "description": "Scroll the mouse wheel at current position",
            "parameters": {
                "type": "object",
                "properties": {
                    "amount": {
                        "type": "integer",
                        "description": "Scroll amount (positive for up, negative for down)",
                        "minimum": -10,
                        "maximum": 10
                    }
                },
                "required": ["amount"]
            }
        }
    })
    @xml_schema(
        tag_name="scroll",
        mappings=[
            {"param_name": "amount", "node_type": "attribute", "path": "amount"}
        ],
        example='''
        <scroll amount="-3">
        </scroll>
        '''
    )
    async def scroll(self, amount: int) -> ToolResult:
        """
        Scroll the mouse wheel at current position.
        Positive values scroll up, negative values scroll down.
        The amount is truncated to an integer and clamped to [-10, 10].
        """
        try:
            amount = int(float(amount))
            amount = max(-10, min(10, amount))
            result = await self._api_request("POST", "/automation/mouse/scroll", {
                "clicks": amount,
                "x": self.mouse_x,
                "y": self.mouse_y
            })
            if result.get("success", False):
                direction = "up" if amount > 0 else "down"
                steps = abs(amount)
                return ToolResult(success=True,
                                  output=f"Scrolled {direction} {steps} step(s) at position ({self.mouse_x}, {self.mouse_y})")
            return ToolResult(success=False, output=f"Failed to scroll: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to scroll: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "typing",
            "description": "Type specified text",
            "parameters": {
                "type": "object",
                "properties": {
                    "text": {
                        "type": "string",
                        "description": "Text to type"
                    }
                },
                "required": ["text"]
            }
        }
    })
    @xml_schema(
        tag_name="typing",
        mappings=[
            {"param_name": "text", "node_type": "content", "path": "text"}
        ],
        example='''
        <typing>Hello World!</typing>
        '''
    )
    async def typing(self, text: str) -> ToolResult:
        """Type specified text."""
        try:
            text = str(text)
            result = await self._api_request("POST", "/automation/keyboard/write", {
                "message": text,
                "interval": 0.01
            })
            if result.get("success", False):
                return ToolResult(success=True, output=f"Typed: {text}")
            return ToolResult(success=False, output=f"Failed to type: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to type: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "press",
            "description": "Press and release a key",
            "parameters": {
                "type": "object",
                "properties": {
                    "key": {
                        "type": "string",
                        "description": "Key to press",
                        "enum": KEYBOARD_KEYS
                    }
                },
                "required": ["key"]
            }
        }
    })
    @xml_schema(
        tag_name="press",
        mappings=[
            {"param_name": "key", "node_type": "attribute", "path": "key"}
        ],
        example='''
        <press key="enter">
        </press>
        '''
    )
    async def press(self, key: str) -> ToolResult:
        """Press and release a key (the name is lower-cased before sending)."""
        try:
            key = str(key).lower()
            result = await self._api_request("POST", "/automation/keyboard/press", {
                "keys": key,
                "presses": 1
            })
            if result.get("success", False):
                return ToolResult(success=True, output=f"Pressed key: {key}")
            return ToolResult(success=False, output=f"Failed to press key: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to press key: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "wait",
            "description": "Wait for specified duration",
            "parameters": {
                "type": "object",
                "properties": {
                    "duration": {
                        "type": "number",
                        "description": "Duration in seconds",
                        "default": 0.5
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="wait",
        mappings=[
            {"param_name": "duration", "node_type": "attribute", "path": "duration"}
        ],
        example='''
        <wait duration="1.5">
        </wait>
        '''
    )
    async def wait(self, duration: float = 0.5) -> ToolResult:
        """Wait for specified duration, clamped to the range 0-10 seconds."""
        try:
            duration = float(duration)
            duration = max(0, min(10, duration))
            await asyncio.sleep(duration)
            return ToolResult(success=True, output=f"Waited {duration} seconds")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to wait: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "mouse_down",
            "description": "Press a mouse button",
            "parameters": {
                "type": "object",
                "properties": {
                    "button": {
                        "type": "string",
                        "description": "Mouse button to press",
                        "enum": ["left", "right", "middle"],
                        "default": "left"
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="mouse-down",
        mappings=[
            {"param_name": "button", "node_type": "attribute", "path": "button"}
        ],
        example='''
        <mouse-down button="left">
        </mouse-down>
        '''
    )
    async def mouse_down(self, button: str = "left", x: Optional[float] = None, y: Optional[float] = None) -> ToolResult:
        """Press a mouse button at current or specified position."""
        try:
            x_int, y_int = self._resolve_position(x, y)
            result = await self._api_request("POST", "/automation/mouse/down", {
                "x": x_int,
                "y": y_int,
                "button": str(button).lower()
            })
            if result.get("success", False):
                self.mouse_x = x_int
                self.mouse_y = y_int
                return ToolResult(success=True, output=f"{button} button pressed at ({x_int}, {y_int})")
            return ToolResult(success=False, output=f"Failed to press button: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to press button: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "mouse_up",
            "description": "Release a mouse button",
            "parameters": {
                "type": "object",
                "properties": {
                    "button": {
                        "type": "string",
                        "description": "Mouse button to release",
                        "enum": ["left", "right", "middle"],
                        "default": "left"
                    }
                }
            }
        }
    })
    @xml_schema(
        tag_name="mouse-up",
        mappings=[
            {"param_name": "button", "node_type": "attribute", "path": "button"}
        ],
        example='''
        <mouse-up button="left">
        </mouse-up>
        '''
    )
    async def mouse_up(self, button: str = "left", x: Optional[float] = None, y: Optional[float] = None) -> ToolResult:
        """Release a mouse button at current or specified position."""
        try:
            x_int, y_int = self._resolve_position(x, y)
            result = await self._api_request("POST", "/automation/mouse/up", {
                "x": x_int,
                "y": y_int,
                "button": str(button).lower()
            })
            if result.get("success", False):
                self.mouse_x = x_int
                self.mouse_y = y_int
                return ToolResult(success=True, output=f"{button} button released at ({x_int}, {y_int})")
            return ToolResult(success=False, output=f"Failed to release button: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to release button: {str(e)}")

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "drag_to",
            "description": "Drag cursor to specified position",
            "parameters": {
                "type": "object",
                "properties": {
                    "x": {
                        "type": "number",
                        "description": "Target X coordinate"
                    },
                    "y": {
                        "type": "number",
                        "description": "Target Y coordinate"
                    }
                },
                "required": ["x", "y"]
            }
        }
    })
    @xml_schema(
        tag_name="drag-to",
        mappings=[
            {"param_name": "x", "node_type": "attribute", "path": "x"},
            {"param_name": "y", "node_type": "attribute", "path": "y"}
        ],
        example='''
        <drag-to x="500" y="50">
        </drag-to>
        '''
    )
    async def drag_to(self, x: float, y: float) -> ToolResult:
        """Click and drag from current position to target position.

        Only the target is sent to the service; the start position quoted in
        the success message is the locally tracked cursor position.
        """
        try:
            target_x = self._coerce_coordinate(x)
            target_y = self._coerce_coordinate(y)
            start_x = self.mouse_x
            start_y = self.mouse_y
            result = await self._api_request("POST", "/automation/mouse/drag", {
                "x": target_x,
                "y": target_y,
                "duration": 0.3,
                "button": "left"
            })
            if result.get("success", False):
                self.mouse_x = target_x
                self.mouse_y = target_y
                return ToolResult(success=True,
                                  output=f"Dragged from ({start_x}, {start_y}) to ({target_x}, {target_y})")
            return ToolResult(success=False, output=f"Failed to drag: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to drag: {str(e)}")

    async def get_screenshot_base64(self) -> Optional[dict]:
        """Capture screen and return as base64 encoded image.

        The decoded image is also written to
        ``screenshots/screenshot_<timestamp>.png`` and to
        ``latest_screenshot.png`` in the current working directory.

        Returns:
            A dict with ``content_type``, ``base64``, ``timestamp`` and
            ``filename`` keys, or None when the response carries no
            ``image`` field or any step fails.
        """
        try:
            result = await self._api_request("POST", "/automation/screenshot")
            if "image" not in result:
                return None

            base64_str = result["image"]
            # Decode first so an invalid payload leaves nothing on disk.
            img_data = base64.b64decode(base64_str)
            timestamp = time.strftime("%Y%m%d_%H%M%S")

            # Save screenshot to file
            screenshots_dir = "screenshots"
            os.makedirs(screenshots_dir, exist_ok=True)
            timestamped_filename = os.path.join(screenshots_dir, f"screenshot_{timestamp}.png")
            latest_filename = "latest_screenshot.png"

            # Write the timestamped file and a copy as the latest screenshot
            for path in (timestamped_filename, latest_filename):
                with open(path, 'wb') as f:
                    f.write(img_data)

            return {
                "content_type": "image/png",
                "base64": base64_str,
                "timestamp": timestamp,
                "filename": timestamped_filename
            }
        except Exception as e:
            logging.error("[Screenshot] Error during screenshot process: %s", e)
            return None

    @openapi_schema({
        "type": "function",
        "function": {
            "name": "hotkey",
            "description": "Press a key combination",
            "parameters": {
                "type": "object",
                "properties": {
                    "keys": {
                        "type": "string",
                        "description": "Key combination to press",
                        "enum": KEYBOARD_KEYS
                    }
                },
                "required": ["keys"]
            }
        }
    })
    @xml_schema(
        tag_name="hotkey",
        mappings=[
            {"param_name": "keys", "node_type": "attribute", "path": "keys"}
        ],
        example='''
        <hotkey keys="ctrl+a">
        </hotkey>
        '''
    )
    async def hotkey(self, keys: str) -> ToolResult:
        """Press a key combination.

        ``keys`` is a '+'-separated list of key names (e.g. ``"ctrl+c"``).
        Names are lower-cased, surrounding whitespace is removed from each
        one, and empty parts are dropped before the request is sent.
        """
        try:
            key_sequence = [part.strip() for part in str(keys).lower().split('+')]
            key_sequence = [part for part in key_sequence if part]
            if not key_sequence:
                return ToolResult(success=False, output="Failed to press keys: no keys specified")

            keys = '+'.join(key_sequence)
            result = await self._api_request("POST", "/automation/keyboard/hotkey", {
                "keys": key_sequence,
                "interval": 0.01
            })
            if result.get("success", False):
                return ToolResult(success=True, output=f"Pressed key combination: {keys}")
            return ToolResult(success=False, output=f"Failed to press keys: {result.get('error', 'Unknown error')}")
        except Exception as e:
            return ToolResult(success=False, output=f"Failed to press keys: {str(e)}")
if __name__ == "__main__":
    # This file only defines the tool; running it as a script just prints a hint.
    print("This module should be imported, not run directly.")