Lenith / computer_control_helper.py
Tecnhotron
First
e78c9e1
# --- START OF FILE computer_control_helper.py ---
from PIL import Image, ImageDraw, ImageFont
import pyautogui
import mss # For screen capture
import json
import os
import time, datetime
def parse_json_safely(json_string: str) -> dict:
    """
    Extract and parse a JSON object (dictionary) embedded in a string.

    The input is expected to be raw model output, so the object may be wrapped
    in a ```json markdown fence and/or surrounded by explanatory text.  The
    function first tries the span from the first '{' to the last '}'.  If that
    span is not valid JSON (for example because trailing text contains another
    '}' or a second object), it falls back to decoding the single object that
    starts at the first '{' and ignores whatever follows it.

    Args:
        json_string: Text that should contain a JSON object.

    Returns:
        The parsed dictionary, or an empty dict when the input is not a
        string, contains no ``{...}`` span, or cannot be parsed.  Problems are
        reported with ``print``; no exception is propagated to the caller.
    """
    if not isinstance(json_string, str):
        print(f"Warning: Input is not a string, but type {type(json_string)}. Returning empty dict.")
        return {}

    try:
        # --- Stage 1: Basic cleaning and markdown fence removal ---
        stripped_input = json_string.strip()
        clean_string = stripped_input

        fence_open = "```json"
        fence_start = clean_string.find(fence_open)
        if fence_start != -1:
            clean_string = clean_string[fence_start + len(fence_open):]
            # Drop the single newline that usually follows the opening fence.
            if clean_string.startswith("\n"):
                clean_string = clean_string[1:]
            # Everything after the last closing fence is commentary.
            fence_end = clean_string.rfind("```")
            if fence_end != -1:
                clean_string = clean_string[:fence_end]
        clean_string = clean_string.strip()

        if not clean_string:
            # Stay quiet when the input was empty or consisted only of fences.
            if stripped_input and stripped_input not in ("```json```", "```json\n```"):
                print("Warning: String is empty after cleaning markdown fences.")
            return {}

        # --- Stage 2: Find the JSON object boundaries ---
        first_brace = clean_string.find('{')
        last_brace = clean_string.rfind('}')
        if first_brace == -1 or last_brace == -1 or last_brace < first_brace:
            # Only warn when the input looked like it was meant to be JSON.
            if '{' in json_string or '}' in json_string:
                print(f"Warning: Could not find valid {{...}} structure in the cleaned string.")
                print(f"Cleaned string snippet: '{clean_string[:100]}...{clean_string[-100:]}'")
            return {}

        potential_json = clean_string[first_brace:last_brace + 1]

        # --- Stage 3: Parse the extracted substring ---
        try:
            parsed = json.loads(potential_json)
        except json.JSONDecodeError as e:
            # The first-'{'-to-last-'}' span can be polluted by trailing text
            # that itself contains braces.  Decode just the object that starts
            # at the first '{'; only that position is tried so a nested
            # fragment of a broken outer object is never returned by mistake.
            try:
                parsed, _ = json.JSONDecoder().raw_decode(clean_string, first_brace)
            except json.JSONDecodeError:
                print(f"Error parsing extracted JSON: {e}")
                print(f"Extracted substring: '{potential_json}'")
                return {}

        # Both decode paths start at '{', so the result is always a dict.  A
        # list wrapper such as [{"a": 1}] is handled implicitly because the
        # brace search already skips the surrounding brackets.
        return parsed
    except Exception as e:
        # Last-resort guard (e.g. RecursionError on pathologically nested
        # input) so callers can rely on always getting a dict back.
        print(f"An unexpected error occurred during JSON extraction or parsing: {e}")
        print(f"Original string: '{json_string}'")
        return {}
def capture_screen() -> Image.Image | None:
    """Grab a screenshot of the primary monitor.

    Returns:
        The screenshot as an RGB PIL image, or None when anything goes wrong
        during capture (the error is printed, not raised).
    """
    try:
        with mss.mss() as grabber:
            # mss behaviour can vary: index 0 is often the combined
            # "all monitors" view and index 1 the primary display.
            preferred_index = 1
            monitors = grabber.monitors
            if preferred_index < len(monitors):
                chosen_index = preferred_index
            else:
                print(f"Warning: Monitor index {preferred_index} not found, using monitor 0.")
                chosen_index = 0

            raw_shot = grabber.grab(monitors[chosen_index])
            # Decode the BGRA buffer as "BGRX" so the fourth byte is ignored
            # and an RGB image is produced.
            screenshot = Image.frombytes("RGB", raw_shot.size, raw_shot.bgra, "raw", "BGRX")
            # The image is deliberately kept at full resolution (no downscaling).
            print(f"Screen captured ({screenshot.width}x{screenshot.height})")
            return screenshot
    except Exception as capture_error:
        print(f"Error capturing screen: {capture_error}")
        return None
def _draw_grid_label(draw, label, font, text_origin, fallback_box, fallback_origin):
    """Draw a red coordinate label on a white backing box so it stays readable."""
    if font:
        left, top, right, bottom = draw.textbbox(text_origin, label, font=font)
        # Pad the measured text box by 5px on every side.
        draw.rectangle((left - 5, top - 5, right + 5, bottom + 5), fill=(255, 255, 255, 220))
        draw.text(text_origin, label, fill=(255, 0, 0), font=font)
    else:
        # Without a measurable font, use a fixed-size backing box.
        draw.rectangle(fallback_box, fill=(255, 255, 255, 220))
        draw.text(fallback_origin, label, fill=(255, 0, 0))


def draw_grid_overlay(img):
    """Return a copy of *img* with a labelled red coordinate grid drawn on it.

    The image is divided into 10 horizontal bands and a proportional number
    of vertical bands (``int(10 * width / height)``, at least 1) so the cells
    are roughly square.  Every interior line is labelled with its position on
    a 0-1000 normalized scale.  Lines are placed at the exact proportional
    pixel position (rounded), so a label always matches the line it annotates
    even when the image size is not a multiple of the line count.

    Args:
        img: A PIL image.  It is not modified.

    Returns:
        The annotated copy, or the original ``img`` unchanged if drawing
        fails for any reason (the error is printed).
    """
    try:
        # Work on a copy so the caller's image stays untouched.
        img_copy = img.copy()
        draw = ImageDraw.Draw(img_copy)
        width, height = img_copy.size

        # Prefer a large TrueType font; fall back to PIL's default bitmap font.
        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except Exception:
            font = None
            print("Warning: Could not load font for grid labels. Using default.")

        # Horizontal lines at 1/10 .. 9/10 of the height.
        h_lines = 10
        for i in range(1, h_lines):
            y = round(i * height / h_lines)
            draw.line([(0, y), (width, y)], fill=(255, 0, 0), width=2)
            label = f"{round(i * 1000 / h_lines)}"
            _draw_grid_label(
                draw,
                label,
                font,
                text_origin=(10, y - 12),
                fallback_box=[(5, y - 10), (45, y + 10)],
                fallback_origin=(5, y - 7),
            )

        # Vertical line count follows the aspect ratio; clamp to 1 so very
        # tall images cannot cause a division by zero below.
        v_lines = max(1, int(10 * (width / height)))
        for i in range(1, v_lines):
            x = round(i * width / v_lines)
            draw.line([(x, 0), (x, height)], fill=(255, 0, 0), width=2)
            label = f"{round(i * 1000 / v_lines)}"
            _draw_grid_label(
                draw,
                label,
                font,
                text_origin=(x + 5, 10),
                fallback_box=[(x + 2, 5), (x + 45, 25)],
                fallback_origin=(x + 2, 5),
            )

        return img_copy
    except Exception as e:
        print(f"Error drawing grid overlay: {e}")
        return img  # Best effort: hand back the original image on failure
def _is_valid_normalized_box(box) -> bool:
    """Check that *box* is a usable [y_min, x_min, y_max, x_max] box on the 0-1000 scale."""
    if not isinstance(box, (list, tuple)) or len(box) != 4:
        print(f"Error: Invalid bounding box format received: {box}")
        return False

    for coord, name in zip(box, ("y_min", "x_min", "y_max", "x_max")):
        # bool is an int subclass, and NaN (coord != coord) would slip through
        # the range comparisons below and later crash round().
        if isinstance(coord, bool) or not isinstance(coord, (int, float)) or coord != coord:
            print(f"Error: {name} coordinate is not a number: {coord}")
            return False
        if coord < 0 or coord > 1000:
            print(f"Error: {name} coordinate out of range [0, 1000]: {coord}")
            return False

    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

    # Reject zero or negative-area boxes.
    if x_min_norm >= x_max_norm:
        print(f"Error: Invalid x-coordinates (min >= max): {x_min_norm} >= {x_max_norm}")
        return False
    if y_min_norm >= y_max_norm:
        print(f"Error: Invalid y-coordinates (min >= max): {y_min_norm} >= {y_max_norm}")
        return False

    # Tiny boxes are allowed but flagged, since they are often detection errors.
    if x_max_norm - x_min_norm < 5 or y_max_norm - y_min_norm < 5:
        print(f"Warning: Very small target area detected ({x_max_norm - x_min_norm}x{y_max_norm - y_min_norm}). This might be inaccurate.")
    return True


def _click_center_of_box(location_data, double: bool) -> bool:
    """Move the mouse to the center of ``location_data['box_2d']`` and click or double-click there."""
    noun = "double-click" if double else "click"
    heading = noun.capitalize()

    if not isinstance(location_data, dict) or "box_2d" not in location_data:
        print(f"No valid location data found to perform {noun}.")
        return False

    box = location_data.get("box_2d")
    label = location_data.get("label", "Unknown Element")

    # Validate the input before touching the GUI at all.
    if not _is_valid_normalized_box(box):
        return False

    # Screen dimensions are needed to translate normalized coordinates.
    try:
        screen_width, screen_height = pyautogui.size()
        print(f"Detected screen dimensions: {screen_width}x{screen_height}")
    except Exception as e:
        print(f"Error getting screen size: {e}")
        return False

    # Box order is y_min, x_min, y_max, x_max on a 0-1000 scale.
    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

    # norm / 1000 is the 0.0-1.0 ratio; scale by the screen size and round to
    # the nearest pixel.
    abs_x_min = round(x_min_norm / 1000 * screen_width)
    abs_y_min = round(y_min_norm / 1000 * screen_height)
    abs_x_max = round(x_max_norm / 1000 * screen_width)
    abs_y_max = round(y_max_norm / 1000 * screen_height)

    if abs_x_max - abs_x_min < 2:
        print(f"Warning: X dimension is very small ({abs_x_max - abs_x_min}px), centering might be imprecise")
    if abs_y_max - abs_y_min < 2:
        print(f"Warning: Y dimension is very small ({abs_y_max - abs_y_min}px), centering might be imprecise")

    # Compute the center in floating point and round once at the end.
    center_x = round(abs_x_min + (abs_x_max - abs_x_min) / 2)
    center_y = round(abs_y_min + (abs_y_max - abs_y_min) / 2)

    purpose = " for double-click" if double else ""
    print(f"Identified '{label}'{purpose} at normalized box: [{y_min_norm}, {x_min_norm}, {y_max_norm}, {x_max_norm}]")
    print(f"Converted to absolute box: [{abs_y_min}, {abs_x_min}, {abs_y_max}, {abs_x_max}]")
    print(f"Calculated {noun} center: ({center_x}, {center_y})")

    # Keep the click a few pixels away from the screen edges.
    screen_margin = 5
    if center_x < screen_margin or center_x >= screen_width - screen_margin:
        print(f"Warning: {heading} X-coordinate ({center_x}) is very close to screen edge")
        center_x = max(screen_margin, min(center_x, screen_width - screen_margin - 1))
        print(f"Adjusted X-coordinate to: {center_x}")
    if center_y < screen_margin or center_y >= screen_height - screen_margin:
        print(f"Warning: {heading} Y-coordinate ({center_y}) is very close to screen edge")
        center_y = max(screen_margin, min(center_y, screen_height - screen_margin - 1))
        print(f"Adjusted Y-coordinate to: {center_y}")

    # Final guard; only reachable on screens smaller than the margins allow.
    if not (0 <= center_x < screen_width and 0 <= center_y < screen_height):
        print(f"Error: Calculated {noun} coordinates ({center_x}, {center_y}) are outside screen bounds ({screen_width}x{screen_height})!")
        print(f"Derived from normalized box: {box}")
        return False

    try:
        print(f"Moving mouse to ({center_x}, {center_y}) and {noun}ing '{label}'...")
        # For long moves, go via the midpoint first, then approach the target
        # with a slower, more precise movement.
        current_x, current_y = pyautogui.position()
        if abs(current_x - center_x) > 100 or abs(current_y - center_y) > 100:
            intermediate_x = current_x + (center_x - current_x) // 2
            intermediate_y = current_y + (center_y - current_y) // 2
            pyautogui.moveTo(intermediate_x, intermediate_y, duration=0.1)
        pyautogui.moveTo(center_x, center_y, duration=0.3)

        # Brief pause before clicking.
        time.sleep(0.1)
        if double:
            pyautogui.doubleClick()
        else:
            pyautogui.click()
        print(f"{heading} performed.")

        # Report (but do not fail on) a cursor that ended up elsewhere.
        after_x, after_y = pyautogui.position()
        if abs(after_x - center_x) > 5 or abs(after_y - center_y) > 5:
            print(f"Warning: Cursor position after {noun} ({after_x}, {after_y}) differs from target ({center_x}, {center_y})")
        return True
    except Exception as e:
        print(f"Error during mouse action: {e}")
        return False


def perform_click(location_data: dict):
    """Click the center of a normalized bounding box.

    Args:
        location_data: Dictionary with
            - 'box_2d': [y_min, x_min, y_max, x_max], each a number in
              [0, 1000], with min strictly less than max on both axes.
            - 'label' (optional): description used in log messages.

    Returns:
        bool: True if the click was performed, False if the data was missing
        or invalid, the screen size could not be read, or the mouse action
        raised.  Errors are printed, not raised.
    """
    return _click_center_of_box(location_data, double=False)


def perform_double_click(location_data: dict):
    """Double-click the center of a normalized bounding box.

    Args:
        location_data: Dictionary with
            - 'box_2d': [y_min, x_min, y_max, x_max], each a number in
              [0, 1000], with min strictly less than max on both axes.
            - 'label' (optional): description used in log messages.

    Returns:
        bool: True if the double-click was performed, False if the data was
        missing or invalid, the screen size could not be read, or the mouse
        action raised.  Errors are printed, not raised.
    """
    return _click_center_of_box(location_data, double=True)
def perform_type(location_data: dict):
    """Type text with the keyboard, clicking the target field first when one is given.

    Args:
        location_data: Dictionary with
            - 'text_to_type': the text to type (required, non-empty).
            - 'box_2d' (optional): bounding box of the field to click first;
              it is passed unchanged to ``perform_click``.
            - 'label' (optional): description used in log messages.

    Returns:
        bool: True when typing was performed; False when the data is missing,
        the preliminary click fails, or typing raises.
    """
    if not location_data:
        print("No valid location data found for typing.")
        return False

    label = location_data.get("label", "Typing Action")
    text_to_type = location_data.get("text_to_type")
    if not text_to_type:
        print(f"Error: 'text_to_type' missing in data for typing action: {location_data}")
        return False

    # Step 1: focus the target field when a bounding box was supplied.
    if "box_2d" not in location_data:
        print("Warning: No 'box_2d' provided for 'type' action. Typing at current cursor position.")
    else:
        print(f"Clicking field '{label}' before typing...")
        if not perform_click(location_data):
            print(f"Failed to click field '{label}' before typing. Aborting type action.")
            return False
        # Give the field a moment to take focus before sending keys.
        time.sleep(0.2)

    # Step 2: send the keystrokes.
    try:
        print(f"Typing text into '{label}': '{text_to_type}'")
        # A small per-key interval makes typing more reliable in slow apps.
        pyautogui.write(text_to_type, interval=0.05)
        print("Typing performed.")
    except Exception as typing_error:
        print(f"Error during typing action: {typing_error}")
        return False
    return True
def perform_press_key(location_data: dict):
    """
    Press a single keyboard key or perform a keyboard shortcut.

    Single keys (e.g., 'enter', 'a', 'f5') are sent with pyautogui.press();
    shortcuts (e.g., 'ctrl+c', 'alt+tab') are split on '+' and sent with
    pyautogui.hotkey().  The key string is lowercased and surrounding
    whitespace is ignored.  A literal plus key is supported both alone ('+')
    and as the last key of a shortcut ('ctrl++').

    Args:
        location_data: A dictionary containing action details. Expected keys:
            - 'key_to_press' (str): The key or shortcut string (e.g., 'enter', 'ctrl+c').
            - 'label' (str, optional): A descriptive label for the action.

    Returns:
        bool: True if the key action was sent, False if the data was missing
        or malformed or pyautogui raised.  Exceptions from pyautogui are
        caught and printed, never propagated.

    NOTE(review): True only means pyautogui accepted the call; whether
    pyautogui rejects or silently ignores unrecognized key names should be
    confirmed against its documentation.
    """
    if not location_data:
        print("No valid location data found for key press/shortcut.")
        return False

    key_action_string = location_data.get("key_to_press")
    # Use the provided label or derive one from the action string.
    label = location_data.get("label", f"Key Action '{key_action_string}'")

    if not key_action_string:
        print(f"Error: 'key_to_press' missing in data for key action: {location_data}")
        return False
    if not isinstance(key_action_string, str):
        # Guard against numbers/lists from loosely structured JSON; calling
        # .lower() on them would raise outside the try block below.
        print(f"Error: 'key_to_press' must be a string, got {type(key_action_string).__name__}: {key_action_string!r}")
        return False

    normalized = key_action_string.lower().strip()
    if not normalized:
        print("Error: 'key_to_press' is empty or whitespace only.")
        return False

    action_keys = [part.strip() for part in normalized.split('+')]
    # Two trailing empty parts mean the string ended in '+' '+', i.e. the
    # final key is the plus key itself ('+' -> ['+'], 'ctrl++' -> ['ctrl', '+']).
    if len(action_keys) >= 2 and not action_keys[-1] and not action_keys[-2]:
        action_keys[-2:] = ['+']

    if any(not key for key in action_keys):
        print(f"Error: Invalid shortcut format '{key_action_string}'. Check for extra '+' or empty parts.")
        return False

    try:
        if len(action_keys) > 1:
            print(f"Performing shortcut: '{key_action_string}' (Action label: '{label}')")
            pyautogui.hotkey(*action_keys)
            print("Shortcut performed.")
        else:
            print(f"Pressing key: '{key_action_string}' (Action label: '{label}')")
            pyautogui.press(action_keys[0])
            print("Key press performed.")
        return True
    except Exception as e:
        # pyautogui may raise various platform-specific errors; report and
        # signal failure rather than crash the caller.
        print(f"Error during key action '{key_action_string}': {e}")
        return False
max_steps = 100  # Safety limit kept for compatibility; a value below 1 disables execution


def do_task(original_task, task_not_complted):
    """Execute the single mouse action described by a JSON task string.

    ``original_task`` is parsed with ``parse_json_safely``; its 'action_type'
    selects ``perform_click`` ('click') or ``perform_double_click``
    ('double_click').  Any other action type is reported as unsupported.

    The action is performed at most once per call.  The task string never
    changes during a call, so repeating the step could only re-issue the
    identical click; progress checking is left to the caller, as the returned
    message states.

    Args:
        original_task: JSON text (optionally fenced or surrounded by prose)
            with 'action_type', 'box_2d', and optional 'label' and
            'task_not_completed' keys.
        task_not_complted: (sic - name kept for caller compatibility) when
            truthy, no action is performed and the completion message is
            returned immediately.
            NOTE(review): the name suggests the opposite meaning; confirm
            the intended convention with the callers.

    Returns:
        The fixed "STEP COMPLETED..." message whether or not the action
        succeeded (failures are only printed), or None if an unexpected
        exception occurred.
    """
    print("IN do_task()")
    try:
        if not task_not_complted and max_steps >= 1:
            step_count = 1
            print(f"\n--- Step {step_count} for Task: '{original_task}' ---")

            # 1. Capture the screen.  The image itself is not used here; the
            #    call is kept so the log still shows whether capture works.
            print("Capturing screen...")
            capture_screen()

            # 2. Parse the JSON input and pull out the step description.
            step_info = parse_json_safely(original_task)
            action_type = step_info.get("action_type")
            task_not_completed = step_info.get("task_not_completed", False)
            label = step_info.get("label", "Unknown Action/Element")
            print(f"Gemini identified action: '{action_type}', Target/Label: '{label}'. Task completed this step: {not task_not_completed}")

            # 3. Perform the action - only click and double_click are supported.
            if action_type == "click":
                action_successful = perform_click(step_info)
            elif action_type == "double_click":
                action_successful = perform_double_click(step_info)
            elif action_type is None:
                print(f"Error: Unknown or missing 'action_type' ('{action_type}') received.")
                action_successful = False
            else:
                # Keyboard actions are expected to go through the function calling model.
                print(f"Error: Action type '{action_type}' is not supported. Only 'click' and 'double_click' are allowed. Keyboard actions should be handled by the function calling model.")
                action_successful = False

            # 4. Report the outcome.
            if not action_successful:
                print(f"Failed to perform action '{action_type}' on '{label}'. Aborting task.")
            elif not task_not_completed:
                print(f"Task '{original_task}' reported as complete after action '{action_type}'.")

        return "STEP COMPLETED. CHECK BY YOURSELF IF IT HAS ACTUALLY BEEN DONE OR NOT. DO NOT ASK THE USER - JUST VIEW THE RECORDING."
    except Exception as e:
        print(f"\n--- An unexpected error occurred in the main loop ---")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()  # Print detailed traceback for debugging
        print("-------------------------------------------------------")
        return None