# --- START OF FILE computer_control_helper.py ---
"""Helpers for driving the desktop from model output.

The module parses JSON action descriptions, captures the screen, draws a
normalized (0-1000) coordinate grid on screenshots and performs mouse and
keyboard actions through ``pyautogui``.
"""

import datetime
import json
import os
import time
import traceback

import mss  # For screen capture
import pyautogui
from PIL import Image, ImageDraw, ImageFont

# Bounding boxes arrive normalized to this range on both axes.
_NORM_SCALE = 1000
# Clicks are kept at least this many pixels away from every screen edge.
_SCREEN_MARGIN = 5
_GRID_COLOR = (255, 0, 0)
_LABEL_BACKGROUND = (255, 255, 255, 220)
_STEP_RESULT_MESSAGE = (
    "STEP COMPLETED. CHECK BY YOURSELF IF IT HAS ACTUALLY BEEN DONE OR NOT. "
    "DO NOT ASK THE USER - JUST VIEW THE RECORDING."
)


def parse_json_safely(json_string: str) -> dict:
    """Extract and parse a JSON object embedded in *json_string*.

    Markdown code fences and surrounding prose are tolerated: after fence
    removal, the text between the first ``{`` and the last ``}`` is handed
    to ``json.loads``.

    Args:
        json_string: Raw text that should contain one JSON object.

    Returns:
        The parsed dictionary, or an empty dict when the input is not a
        string, contains no ``{...}`` span, or fails to parse. Problems are
        reported with ``print``; nothing is raised.
    """
    if not isinstance(json_string, str):
        print(f"Warning: Input is not a string, but type {type(json_string)}. Returning empty dict.")
        return {}

    potential_json = ""
    try:
        # --- Stage 1: basic cleaning and markdown fence removal ---
        stripped_input = json_string.strip()
        clean_string = stripped_input

        fence_tag = "```json"
        tag_pos = clean_string.find(fence_tag)
        if tag_pos != -1:
            content_start = tag_pos + len(fence_tag)
            # Skip the optional newline that follows the opening fence.
            if content_start < len(clean_string) and clean_string[content_start] == "\n":
                content_start += 1
            clean_string = clean_string[content_start:]

        fence_pos = clean_string.rfind("```")
        if fence_pos != -1:
            if "{" in clean_string[:fence_pos]:
                # Closing fence: keep what precedes it.
                clean_string = clean_string[:fence_pos]
            else:
                # No JSON before the fence, so it is an unclosed opening
                # fence; cutting before it would throw the payload away.
                clean_string = clean_string[fence_pos + 3:]

        clean_string = clean_string.strip()
        if not clean_string:
            # Stay quiet when the input was empty or consisted only of fences.
            if stripped_input and stripped_input not in ("```json```", "```json\n```"):
                print("Warning: String is empty after cleaning markdown fences.")
            return {}

        # --- Stage 2: find the JSON object boundaries ---
        first_brace = clean_string.find("{")
        last_brace = clean_string.rfind("}")
        if first_brace == -1 or last_brace == -1 or last_brace < first_brace:
            # Only warn when the input looked like it was meant to be JSON.
            if "{" in json_string or "}" in json_string:
                print("Warning: Could not find valid {...} structure in the cleaned string.")
                print(f"Cleaned string snippet: '{clean_string[:100]}...{clean_string[-100:]}'")
            return {}

        potential_json = clean_string[first_brace:last_brace + 1]

        # --- Stage 3: parse the extracted substring ---
        parsed = json.loads(potential_json)

        # --- Stage 4: validate the parsed structure ---
        # Text that starts with '{' can only decode to a dict; the check is
        # kept as a cheap guard for the function's return contract.
        if isinstance(parsed, dict):
            return parsed
        print(f"Warning: Parsed JSON is not a dictionary (type: {type(parsed)}). Content: {parsed}")
        return {}

    except json.JSONDecodeError as e:
        print(f"Error parsing extracted JSON: {e}")
        print(f"Extracted substring: '{potential_json}'")
        return {}
    except Exception as e:
        print(f"An unexpected error occurred during JSON extraction or parsing: {e}")
        print(f"Original string: '{json_string}'")
        return {}


def capture_screen() -> Image.Image | None:
    """Capture the primary monitor and return it as a PIL image.

    Returns:
        An RGB ``PIL.Image.Image`` of the screen, or ``None`` if the capture
        fails for any reason (the error is printed).
    """
    try:
        with mss.mss() as sct:
            # In mss, monitors[0] is the combined "all monitors" area and
            # monitors[1] is the first real monitor.
            monitor_index = 1
            if len(sct.monitors) <= monitor_index:
                print(f"Warning: Monitor index {monitor_index} not found, using monitor 0.")
                monitor_index = 0
            shot = sct.grab(sct.monitors[monitor_index])
            img = Image.frombytes("RGB", shot.size, shot.bgra, "raw", "BGRX")
            # Deliberately not downscaled: higher resolution helps the model.
            print(f"Screen captured ({img.width}x{img.height})")
            return img
    except Exception as e:
        print(f"Error capturing screen: {e}")
        return None


def _draw_grid_label(draw, label, font, text_pos, fallback_box, fallback_text_pos):
    """Draw *label* in red on a white backing rectangle.

    With a TrueType *font* the backing box is measured from the text and
    padded by 5 px; without one a fixed *fallback_box* is used together with
    PIL's built-in default font.
    """
    if font:
        left, top, right, bottom = draw.textbbox(text_pos, label, font=font)
        draw.rectangle((left - 5, top - 5, right + 5, bottom + 5), fill=_LABEL_BACKGROUND)
        draw.text(text_pos, label, fill=_GRID_COLOR, font=font)
    else:
        draw.rectangle(fallback_box, fill=_LABEL_BACKGROUND)
        draw.text(fallback_text_pos, label, fill=_GRID_COLOR)


def draw_grid_overlay(img):
    """Return a copy of *img* with a labelled red coordinate grid.

    Ten horizontal bands are drawn, and a number of vertical bands chosen so
    that grid cells are roughly square. Every line is labelled with its
    position on the normalized 0-1000 scale.

    Args:
        img: A PIL image; it is not modified.

    Returns:
        The annotated copy, or the original *img* if drawing fails.
    """
    try:
        canvas = img.copy()
        width, height = canvas.size
        if width <= 0 or height <= 0:
            return canvas  # Nothing to draw on.
        draw = ImageDraw.Draw(canvas)

        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except Exception:
            font = None
            print("Warning: Could not load font for grid labels. Using default.")

        # Horizontal lines. Positions come from the same fraction as the
        # label so the drawn line and its printed coordinate always agree
        # (floor-divided spacing drifts by several pixels on sizes that are
        # not a multiple of the line count).
        h_lines = 10
        for i in range(1, h_lines):
            y = round(i * height / h_lines)
            draw.line([(0, y), (width, y)], fill=_GRID_COLOR, width=2)
            label = f"{int((i / h_lines) * 1000)}"
            _draw_grid_label(
                draw, label, font,
                text_pos=(10, y - 12),
                fallback_box=[(5, y - 10), (45, y + 10)],
                fallback_text_pos=(5, y - 7),
            )

        # Vertical lines; at least one band so very tall images do not cause
        # a division by zero.
        v_lines = max(1, int(10 * (width / height)))
        for i in range(1, v_lines):
            x = round(i * width / v_lines)
            draw.line([(x, 0), (x, height)], fill=_GRID_COLOR, width=2)
            label = f"{int((i / v_lines) * 1000)}"
            _draw_grid_label(
                draw, label, font,
                text_pos=(x + 5, 10),
                fallback_box=[(x + 2, 5), (x + 45, 25)],
                fallback_text_pos=(x + 2, 5),
            )

        return canvas
    except Exception as e:
        print(f"Error drawing grid overlay: {e}")
        return img  # Return original image if there's an error


def _validate_normalized_box(box) -> bool:
    """Check that *box* is ``[y_min, x_min, y_max, x_max]`` within 0-1000.

    Prints the reason and returns False for a wrong container or length,
    non-numeric entries (booleans included), values outside the range (NaN
    included) and empty or inverted boxes.
    """
    if not isinstance(box, (list, tuple)) or len(box) != 4:
        print(f"Error: Invalid bounding box format received: {box}")
        return False

    for coord, name in zip(box, ["y_min", "x_min", "y_max", "x_max"]):
        if isinstance(coord, bool) or not isinstance(coord, (int, float)):
            print(f"Error: {name} coordinate is not a number: {coord}")
            return False
        # Written as a chained comparison so NaN is rejected as well.
        if not 0 <= coord <= _NORM_SCALE:
            print(f"Error: {name} coordinate out of range [0, 1000]: {coord}")
            return False

    y_min, x_min, y_max, x_max = box
    if x_min >= x_max:
        print(f"Error: Invalid x-coordinates (min >= max): {x_min} >= {x_max}")
        return False
    if y_min >= y_max:
        print(f"Error: Invalid y-coordinates (min >= max): {y_min} >= {y_max}")
        return False
    return True


def _keep_off_edge(value, limit, axis, action_title):
    """Clamp *value* into ``[margin, limit - margin - 1]``, logging changes."""
    if value < _SCREEN_MARGIN or value >= limit - _SCREEN_MARGIN:
        print(f"Warning: {action_title} {axis}-coordinate ({value}) is very close to screen edge")
        value = max(_SCREEN_MARGIN, min(value, limit - _SCREEN_MARGIN - 1))
        print(f"Adjusted {axis}-coordinate to: {value}")
    return value


def _click_box_center(location_data, *, double: bool) -> bool:
    """Shared implementation of ``perform_click`` / ``perform_double_click``.

    Validates ``location_data["box_2d"]``, converts it from the normalized
    0-1000 scale to screen pixels, moves the mouse to the box centre and
    issues a single or double click.

    Returns:
        True when the click was issued, False on any validation or
        pyautogui error (which is printed).
    """
    action = "double-click" if double else "click"
    action_title = action.capitalize()
    action_verb = "double-clicking" if double else "clicking"

    if not isinstance(location_data, dict) or "box_2d" not in location_data:
        print(f"No valid location data found to perform {action}.")
        return False

    box = location_data.get("box_2d")
    label = location_data.get("label", "Unknown Element")
    if not isinstance(box, (list, tuple)) or len(box) != 4:
        print(f"Error: Invalid bounding box format received: {box}")
        return False

    # Screen size is needed to translate normalized coordinates.
    try:
        screen_width, screen_height = pyautogui.size()
        print(f"Detected screen dimensions: {screen_width}x{screen_height}")
    except Exception as e:
        print(f"Error getting screen size: {e}")
        return False

    if not _validate_normalized_box(box):
        return False
    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

    # Tiny boxes are allowed, but they are a common sign of a bad detection.
    if x_max_norm - x_min_norm < 5 or y_max_norm - y_min_norm < 5:
        print(f"Warning: Very small target area detected ({x_max_norm - x_min_norm}x{y_max_norm - y_min_norm}). This might be inaccurate.")

    # norm / 1000 is the fraction of the screen; scale and round to pixels.
    abs_x_min = round(x_min_norm / _NORM_SCALE * screen_width)
    abs_y_min = round(y_min_norm / _NORM_SCALE * screen_height)
    abs_x_max = round(x_max_norm / _NORM_SCALE * screen_width)
    abs_y_max = round(y_max_norm / _NORM_SCALE * screen_height)

    if abs_x_max - abs_x_min < 2:
        print(f"Warning: X dimension is very small ({abs_x_max - abs_x_min}px), centering might be imprecise")
    if abs_y_max - abs_y_min < 2:
        print(f"Warning: Y dimension is very small ({abs_y_max - abs_y_min}px), centering might be imprecise")

    center_x = round(abs_x_min + (abs_x_max - abs_x_min) / 2)
    center_y = round(abs_y_min + (abs_y_max - abs_y_min) / 2)

    target_note = " for double-click" if double else ""
    print(f"Identified '{label}'{target_note} at normalized box: [{y_min_norm}, {x_min_norm}, {y_max_norm}, {x_max_norm}]")
    print(f"Converted to absolute box: [{abs_y_min}, {abs_x_min}, {abs_y_max}, {abs_x_max}]")
    print(f"Calculated {action} center: ({center_x}, {center_y})")

    # Stay clear of the screen edges.
    center_x = _keep_off_edge(center_x, screen_width, "X", action_title)
    center_y = _keep_off_edge(center_y, screen_height, "Y", action_title)

    if not (0 <= center_x < screen_width and 0 <= center_y < screen_height):
        print(f"Error: Calculated {action} coordinates ({center_x}, {center_y}) are outside screen bounds ({screen_width}x{screen_height})!")
        print(f"Derived from normalized box: {box}")
        return False

    try:
        print(f"Moving mouse to ({center_x}, {center_y}) and {action_verb} '{label}'...")

        # For long moves, travel via the halfway point first.
        current_x, current_y = pyautogui.position()
        if abs(current_x - center_x) > 100 or abs(current_y - center_y) > 100:
            halfway_x = current_x + (center_x - current_x) // 2
            halfway_y = current_y + (center_y - current_y) // 2
            pyautogui.moveTo(halfway_x, halfway_y, duration=0.1)

        # Slower final approach, then a short settle before clicking.
        pyautogui.moveTo(center_x, center_y, duration=0.3)
        time.sleep(0.1)

        if double:
            pyautogui.doubleClick()
        else:
            pyautogui.click()
        print(f"{action_title} performed.")

        # Report (but do not fail on) a cursor that ended up elsewhere.
        after_x, after_y = pyautogui.position()
        if abs(after_x - center_x) > 5 or abs(after_y - center_y) > 5:
            print(f"Warning: Cursor position after {action} ({after_x}, {after_y}) differs from target ({center_x}, {center_y})")
        return True
    except Exception as e:
        print(f"Error during mouse action: {e}")
        return False


def perform_click(location_data: dict):
    """Click the centre of ``location_data["box_2d"]``.

    Args:
        location_data: Dict with ``box_2d`` as ``[y_min, x_min, y_max,
            x_max]`` on the 0-1000 scale and an optional ``label`` used in
            log output.

    Returns:
        True if the click was issued, False otherwise.
    """
    return _click_box_center(location_data, double=False)


def perform_double_click(location_data: dict):
    """Double-click the centre of ``location_data["box_2d"]``.

    Args:
        location_data: Dict with ``box_2d`` as ``[y_min, x_min, y_max,
            x_max]`` on the 0-1000 scale and an optional ``label`` used in
            log output.

    Returns:
        True if the double-click was issued, False otherwise.
    """
    return _click_box_center(location_data, double=True)


def perform_type(location_data: dict):
    """Type ``location_data["text_to_type"]``, clicking the target first.

    When ``box_2d`` is present the field is activated with
    ``perform_click``; otherwise the text is typed wherever keyboard focus
    currently is.

    Args:
        location_data: Dict with ``text_to_type`` and optionally ``box_2d``
            and ``label``.

    Returns:
        True if the text was typed, False if data was missing, the
        preliminary click failed, or pyautogui raised.
    """
    if not location_data:
        print("No valid location data found for typing.")
        return False

    text_to_type = location_data.get("text_to_type")
    label = location_data.get("label", "Typing Action")
    if not text_to_type:
        print(f"Error: 'text_to_type' missing in data for typing action: {location_data}")
        return False

    # --- Step 1: click the target field (if box_2d is provided) ---
    if "box_2d" in location_data:
        print(f"Clicking field '{label}' before typing...")
        if not perform_click(location_data):
            print(f"Failed to click field '{label}' before typing. Aborting type action.")
            return False
        time.sleep(0.2)  # Give the field time to take focus.
    else:
        print("Warning: No 'box_2d' provided for 'type' action. Typing at current cursor position.")

    # --- Step 2: perform typing ---
    try:
        print(f"Typing text into '{label}': '{text_to_type}'")
        # A small per-key interval is more reliable in slow applications.
        pyautogui.write(text_to_type, interval=0.05)
        print("Typing performed.")
        return True
    except Exception as e:
        print(f"Error during typing action: {e}")
        return False


def _split_key_combo(combo: str) -> list:
    """Split a ``'+'``-separated key string into individual key names.

    The plus key itself is supported both alone (``"+"``) and as the final
    key of a shortcut (``"ctrl++"``). Empty parts are kept in the result so
    the caller can reject malformed input such as ``"ctrl+"``.
    """
    combo = combo.strip()
    if combo == "+":
        return ["+"]
    ends_with_plus_key = combo.endswith("++")
    if ends_with_plus_key:
        combo = combo[:-2]
    keys = [part.strip() for part in combo.split("+")]
    if ends_with_plus_key:
        keys.append("+")
    return keys


def perform_press_key(location_data: dict):
    """Press a single key or a keyboard shortcut.

    Single keys (e.g. ``'enter'``, ``'a'``, ``'f5'``) go through
    ``pyautogui.press()``; strings made of several ``'+'``-separated keys
    (e.g. ``'ctrl+c'``, ``'alt+tab'``) go through ``pyautogui.hotkey()``.
    Key names are lower-cased and stripped before use.

    Args:
        location_data: A dictionary containing action details. Expected keys:
            - 'key_to_press' (str): The key or shortcut string.
            - 'label' (str, optional): A descriptive label for the action.

    Returns:
        bool: True if the action was issued, False otherwise. Exceptions
        raised by pyautogui are caught, printed and reported as False.
    """
    if not location_data:
        print("No valid location data found for key press/shortcut.")
        return False

    key_action_string = location_data.get("key_to_press")
    label = location_data.get("label", f"Key Action '{key_action_string}'")

    if not key_action_string:
        print(f"Error: 'key_to_press' missing in data for key action: {location_data}")
        return False
    if not isinstance(key_action_string, str):
        print(f"Error: 'key_to_press' must be a string, got {type(key_action_string).__name__}: {key_action_string!r}")
        return False

    action_keys = _split_key_combo(key_action_string.lower())

    try:
        if len(action_keys) > 1:
            if any(not key for key in action_keys):
                print(f"Error: Invalid shortcut format '{key_action_string}'. Check for extra '+' or empty parts.")
                return False
            print(f"Performing shortcut: '{key_action_string}' (Action label: '{label}')")
            pyautogui.hotkey(*action_keys)
            print("Shortcut performed.")
        else:
            single_key = action_keys[0]
            if not single_key:
                print("Error: 'key_to_press' is empty or whitespace only.")
                return False
            print(f"Pressing key: '{key_action_string}' (Action label: '{label}')")
            pyautogui.press(single_key)
            print("Key press performed.")
        return True
    except Exception as e:
        print(f"Error during key action '{key_action_string}': {e}")
        return False


# Historical safety cap for the step loop. do_task now executes its single
# step once, but the name is kept so existing imports keep working.
max_steps = 100


def do_task(original_task, task_not_complted):
    """Execute the single mouse action described by *original_task*.

    Args:
        original_task: JSON text (optionally fenced or wrapped in prose)
            with ``action_type`` (``"click"`` or ``"double_click"``),
            ``box_2d`` and optionally ``label`` and ``task_not_completed``.
        task_not_complted: Gate flag (parameter name kept as-is for
            callers). The action is executed only when this is falsy.
            NOTE(review): that reads inverted relative to the name; it is
            preserved to stay compatible with existing callers - confirm
            the intended meaning.

    Returns:
        A fixed instruction string telling the caller to verify the result
        itself, whether or not the action succeeded; ``None`` if an
        unexpected exception occurred (it is printed with a traceback).
    """
    print("IN do_task()")
    try:
        if task_not_complted:
            print("do_task: 'task_not_complted' is set, so no action was executed.")
            return _STEP_RESULT_MESSAGE

        # The input never changes between iterations, so the step is run
        # exactly once instead of being repeated up to max_steps times.
        print(f"\n--- Step 1 for Task: '{original_task}' ---")

        step_info = parse_json_safely(original_task)
        action_type = step_info.get("action_type")
        task_not_completed = step_info.get("task_not_completed", False)
        label = step_info.get("label", "Unknown Action/Element")
        print(f"Gemini identified action: '{action_type}', Target/Label: '{label}'. Task completed this step: {not task_not_completed}")

        # Only mouse clicks are handled here.
        handlers = {"click": perform_click, "double_click": perform_double_click}
        handler = handlers.get(action_type) if isinstance(action_type, str) else None

        action_successful = False
        if handler is not None:
            action_successful = handler(step_info)
        elif action_type is None:
            print(f"Error: Unknown or missing 'action_type' ('{action_type}') received.")
        else:
            print(f"Error: Action type '{action_type}' is not supported. Only 'click' and 'double_click' are allowed. Keyboard actions should be handled by the function calling model.")

        if not action_successful:
            print(f"Failed to perform action '{action_type}' on '{label}'. Aborting task.")
        elif not task_not_completed:
            print(f"Task '{original_task}' reported as complete after action '{action_type}'.")

        return _STEP_RESULT_MESSAGE
    except Exception as e:
        print("\n--- An unexpected error occurred in the main loop ---")
        print(f"Error: {e}")
        traceback.print_exc()  # Detailed traceback for debugging
        print("-------------------------------------------------------")
        return None