Spaces:
Running
Running
# --- START OF FILE computer_control_helper.py --- | |
from PIL import Image, ImageDraw, ImageFont | |
import pyautogui | |
import mss # For screen capture | |
import json | |
import os | |
import time, datetime | |
def parse_json_safely(json_string: str) -> dict:
    """Extract and parse one JSON object (dictionary) from free-form text.

    The text may wrap the JSON in a ```json markdown fence and/or surround
    it with prose.  Parsing proceeds in stages:

    1. Strip whitespace and, if a ```json fence is present, keep only the
       text between it and the last closing fence.
    2. Take the span from the first '{' to the last '}' and parse it.
    3. If that span is not valid JSON (for example because trailing prose
       contains a brace), parse the first complete object that starts at
       the first '{' and ignore whatever follows it.

    Args:
        json_string: Text expected to contain a JSON object.

    Returns:
        The parsed dictionary.  An empty dict is returned when the input is
        not a string, holds no ``{...}`` span, or cannot be parsed.  Problems
        are reported with ``print``; no exception is propagated.
    """
    if not isinstance(json_string, str):
        print(f"Warning: Input is not a string, but type {type(json_string)}. Returning empty dict.")
        return {}

    # Bound up front so the error handler below can always report it.
    potential_json = ""
    try:
        # --- Stage 1: Basic cleaning and markdown fence removal ---
        clean_string = json_string.strip()
        fence_marker = "```json"
        md_json_start = clean_string.find(fence_marker)
        if md_json_start != -1:
            potential_start = md_json_start + len(fence_marker)
            # Skip the optional newline that directly follows the fence.
            if clean_string[potential_start:potential_start + 1] == "\n":
                potential_start += 1
            clean_string = clean_string[potential_start:]
            # Drop the closing fence and anything after it.
            md_end = clean_string.rfind("```")
            if md_end != -1:
                clean_string = clean_string[:md_end]
        clean_string = clean_string.strip()

        if not clean_string:
            # Stay quiet when the input was empty or nothing but fences.
            stripped_input = json_string.strip()
            if stripped_input and stripped_input not in ("```json```", "```json\n```"):
                print("Warning: String is empty after cleaning markdown fences.")
            return {}

        # --- Stage 2: Find the JSON object boundaries ---
        first_brace = clean_string.find("{")
        last_brace = clean_string.rfind("}")
        if first_brace == -1 or last_brace < first_brace:
            # Only warn when the input looked like it was meant to be JSON.
            if "{" in json_string or "}" in json_string:
                print("Warning: Could not find valid {...} structure in the cleaned string.")
                print(f"Cleaned string snippet: '{clean_string[:100]}...{clean_string[-100:]}'")
            return {}

        potential_json = clean_string[first_brace:last_brace + 1]

        # --- Stage 3: Parse the extracted substring ---
        try:
            parsed = json.loads(potential_json)
        except json.JSONDecodeError as slice_error:
            # The first-to-last-brace span can swallow unrelated braces in
            # trailing text.  Fall back to the first complete object.
            try:
                parsed, end = json.JSONDecoder().raw_decode(clean_string, first_brace)
            except json.JSONDecodeError:
                # Report the error for the span the user can actually see.
                raise slice_error from None
            potential_json = clean_string[first_brace:end]
            print("Warning: Ignoring text after the first complete JSON object.")

        # Both parse paths start at '{', so a successful parse is a dict.
        return parsed

    except json.JSONDecodeError as e:
        print(f"Error parsing extracted JSON: {e}")
        print(f"Extracted substring: '{potential_json}'")
        return {}
    except Exception as e:
        # Catch any other unexpected errors during the process.
        print(f"An unexpected error occurred during JSON extraction or parsing: {e}")
        print(f"Original string: '{json_string}'")
        return {}
def capture_screen() -> Image.Image | None:
    """Grab one monitor with mss and return it as an RGB PIL image.

    Monitor index 1 is requested first; when the monitor list is too short
    for that index, index 0 is used instead and a warning is printed.
    (The original author notes that index 0 is often the combined
    all-monitors view and index 1 the primary display.)

    Returns:
        The captured image, or None if anything in the capture raised; the
        error is printed rather than propagated.
    """
    try:
        with mss.mss() as grabber:
            available = grabber.monitors
            chosen = 1
            if chosen >= len(available):
                print(f"Warning: Monitor index {chosen} not found, using monitor 0.")
                chosen = 0  # Fallback

            shot = grabber.grab(available[chosen])
            # mss hands back BGRA bytes; decode them as BGRX into an RGB image.
            screenshot = Image.frombytes("RGB", shot.size, shot.bgra, "raw", "BGRX")
            print(f"Screen captured ({screenshot.width}x{screenshot.height})")
            return screenshot
    except Exception as exc:
        print(f"Error capturing screen: {exc}")
        return None
def _draw_grid_label(draw, label, font, text_pos, fallback_box, fallback_text_pos):
    """Draw a red ``label`` on a white backing box, with or without a loaded font."""
    if font:
        left, top, right, bottom = draw.textbbox(text_pos, label, font=font)
        # Pad the measured text box so the label stays readable over the screenshot.
        draw.rectangle((left - 5, top - 5, right + 5, bottom + 5), fill=(255, 255, 255, 220))
        draw.text(text_pos, label, fill=(255, 0, 0), font=font)
    else:
        draw.rectangle(fallback_box, fill=(255, 255, 255, 220))
        draw.text(fallback_text_pos, label, fill=(255, 0, 0))


def draw_grid_overlay(img):
    """Return a copy of ``img`` with a red, labelled coordinate grid drawn on it.

    Nine horizontal lines split the image into ten bands.  The number of
    vertical divisions is ``int(10 * width / height)`` (at least 1) so cells
    stay roughly square.  Each line is labelled with its position on a
    0-1000 scale, and the line is drawn at exactly that fraction of the
    image so the label matches where the line really is.

    Args:
        img: Image to annotate; it is copied, not modified.

    Returns:
        The annotated copy, or the original ``img`` if drawing raised (the
        error is printed).
    """
    try:
        img_copy = img.copy()
        draw = ImageDraw.Draw(img_copy)
        width, height = img_copy.size

        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except (OSError, ImportError):
            # Font file missing/unreadable, or FreeType support unavailable.
            font = None
            print("Warning: Could not load font for grid labels. Using default.")

        # Horizontal lines with labels on the left edge.
        h_divisions = 10
        for i in range(1, h_divisions):
            # Place the line at the labelled fraction rather than at a
            # multiple of a floor-divided spacing, which drifts from the label.
            y = round(i * height / h_divisions)
            draw.line([(0, y), (width, y)], fill=(255, 0, 0), width=2)
            label = f"{i * 1000 // h_divisions}"
            _draw_grid_label(
                draw, label, font,
                text_pos=(10, y - 12),
                fallback_box=[(5, y - 10), (45, y + 10)],
                fallback_text_pos=(5, y - 7),
            )

        # Vertical lines: scale the count by the aspect ratio.  Clamp to 1 so
        # a very tall image cannot produce a zero division count.
        v_lines = max(1, int(10 * width / height))
        for i in range(1, v_lines):
            x = round(i * width / v_lines)
            draw.line([(x, 0), (x, height)], fill=(255, 0, 0), width=2)
            label = f"{i * 1000 // v_lines}"
            _draw_grid_label(
                draw, label, font,
                text_pos=(x + 5, 10),
                fallback_box=[(x + 2, 5), (x + 45, 25)],
                fallback_text_pos=(x + 2, 5),
            )

        return img_copy
    except Exception as e:
        print(f"Error drawing grid overlay: {e}")
        return img  # Return original image if there's an error
# Minimum distance, in pixels, kept between a click point and the screen edge.
_SCREEN_MARGIN = 5


def _resolve_click_point(location_data, action):
    """Validate ``location_data`` and turn its box into a screen pixel.

    ``action`` is "click" or "double-click" and is only used in messages.
    Returns ``(center_x, center_y, label)``, or None after printing the
    reason when the data is unusable.
    """
    if not isinstance(location_data, dict) or "box_2d" not in location_data:
        print(f"No valid location data found to perform {action}.")
        return None

    box = location_data.get("box_2d")
    label = location_data.get("label", "Unknown Element")

    if not isinstance(box, (list, tuple)) or len(box) != 4:
        print(f"Error: Invalid bounding box format received: {box}")
        return None

    # Box order is y_min, x_min, y_max, x_max on a 0-1000 scale.
    for coord, name in zip(box, ("y_min", "x_min", "y_max", "x_max")):
        if isinstance(coord, bool) or not isinstance(coord, (int, float)):
            print(f"Error: {name} coordinate is not a number: {coord}")
            return None
        # Written as a chained comparison so NaN is rejected too.
        if not 0 <= coord <= 1000:
            print(f"Error: {name} coordinate out of range [0, 1000]: {coord}")
            return None

    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

    # Reject zero or negative-area boxes.
    if x_min_norm >= x_max_norm:
        print(f"Error: Invalid x-coordinates (min >= max): {x_min_norm} >= {x_max_norm}")
        return None
    if y_min_norm >= y_max_norm:
        print(f"Error: Invalid y-coordinates (min >= max): {y_min_norm} >= {y_max_norm}")
        return None

    # Tiny boxes are allowed but flagged as possibly inaccurate.
    if x_max_norm - x_min_norm < 5 or y_max_norm - y_min_norm < 5:
        print(f"Warning: Very small target area detected ({x_max_norm - x_min_norm}x{y_max_norm - y_min_norm}). This might be inaccurate.")

    try:
        screen_width, screen_height = pyautogui.size()
        print(f"Detected screen dimensions: {screen_width}x{screen_height}")
    except Exception as e:
        print(f"Error getting screen size: {e}")
        return None

    # Normalized value / 1000 is the fraction of the screen dimension.
    abs_x_min = round(x_min_norm / 1000 * screen_width)
    abs_y_min = round(y_min_norm / 1000 * screen_height)
    abs_x_max = round(x_max_norm / 1000 * screen_width)
    abs_y_max = round(y_max_norm / 1000 * screen_height)

    if abs_x_max - abs_x_min < 2:
        print(f"Warning: X dimension is very small ({abs_x_max - abs_x_min}px), centering might be imprecise")
    if abs_y_max - abs_y_min < 2:
        print(f"Warning: Y dimension is very small ({abs_y_max - abs_y_min}px), centering might be imprecise")

    # Centre from the unrounded values so rounding happens only once.
    center_x = round((x_min_norm + x_max_norm) / 2000 * screen_width)
    center_y = round((y_min_norm + y_max_norm) / 2000 * screen_height)

    purpose = "" if action == "click" else f" for {action}"
    print(f"Identified '{label}'{purpose} at normalized box: [{y_min_norm}, {x_min_norm}, {y_max_norm}, {x_max_norm}]")
    print(f"Converted to absolute box: [{abs_y_min}, {abs_x_min}, {abs_y_max}, {abs_x_max}]")
    print(f"Calculated {action} center: ({center_x}, {center_y})")

    # Pull points that sit too close to an edge back into the safe zone.
    action_title = action.capitalize()
    if center_x < _SCREEN_MARGIN or center_x >= screen_width - _SCREEN_MARGIN:
        print(f"Warning: {action_title} X-coordinate ({center_x}) is very close to screen edge")
        center_x = max(_SCREEN_MARGIN, min(center_x, screen_width - _SCREEN_MARGIN - 1))
        print(f"Adjusted X-coordinate to: {center_x}")
    if center_y < _SCREEN_MARGIN or center_y >= screen_height - _SCREEN_MARGIN:
        print(f"Warning: {action_title} Y-coordinate ({center_y}) is very close to screen edge")
        center_y = max(_SCREEN_MARGIN, min(center_y, screen_height - _SCREEN_MARGIN - 1))
        print(f"Adjusted Y-coordinate to: {center_y}")

    # Still possible on a screen narrower than twice the margin.
    if not (0 <= center_x < screen_width and 0 <= center_y < screen_height):
        print(f"Error: Calculated {action} coordinates ({center_x}, {center_y}) are outside screen bounds ({screen_width}x{screen_height})!")
        print(f"Derived from normalized box: {box}")
        return None

    return center_x, center_y, label


def _perform_pointer_action(location_data, action, double):
    """Move the mouse to the box centre and click once, or twice when ``double``."""
    target = _resolve_click_point(location_data, action)
    if target is None:
        return False
    center_x, center_y, label = target

    try:
        print(f"Moving mouse to ({center_x}, {center_y}) and {action}ing '{label}'...")
        # For long moves, go through the midpoint first, then approach the
        # target more slowly.
        current_x, current_y = pyautogui.position()
        if abs(current_x - center_x) > 100 or abs(current_y - center_y) > 100:
            intermediate_x = current_x + (center_x - current_x) // 2
            intermediate_y = current_y + (center_y - current_y) // 2
            pyautogui.moveTo(intermediate_x, intermediate_y, duration=0.1)
        pyautogui.moveTo(center_x, center_y, duration=0.3)
        # Brief pause before clicking.
        time.sleep(0.1)
        if double:
            pyautogui.doubleClick()
        else:
            pyautogui.click()
        print(f"{action.capitalize()} performed.")
        # Report (but do not fail on) a cursor that ended up elsewhere.
        after_x, after_y = pyautogui.position()
        if abs(after_x - center_x) > 5 or abs(after_y - center_y) > 5:
            print(f"Warning: Cursor position after {action} ({after_x}, {after_y}) differs from target ({center_x}, {center_y})")
        return True
    except Exception as e:
        print(f"Error during mouse action: {e}")
        return False


def perform_click(location_data: dict):
    """Click the centre of the bounding box described by ``location_data``.

    Args:
        location_data: Dict with ``"box_2d"`` = ``[y_min, x_min, y_max,
            x_max]`` on a 0-1000 scale, and an optional ``"label"`` used in
            log messages.

    Returns:
        True if the click was issued; False if the data was invalid, the
        screen size could not be read, or the mouse action raised.
    """
    return _perform_pointer_action(location_data, "click", double=False)


def perform_double_click(location_data: dict):
    """Double-click the centre of the bounding box described by ``location_data``.

    Args:
        location_data: Dict with ``"box_2d"`` = ``[y_min, x_min, y_max,
            x_max]`` on a 0-1000 scale, and an optional ``"label"`` used in
            log messages.

    Returns:
        True if the double-click was issued; False if the data was invalid,
        the screen size could not be read, or the mouse action raised.
    """
    return _perform_pointer_action(location_data, "double-click", double=True)
def perform_type(location_data: dict):
    """Type text with the keyboard, clicking a target field first when one is given.

    Args:
        location_data: Dict holding ``"text_to_type"`` (required), an
            optional ``"box_2d"`` that is handed to ``perform_click`` to
            focus the field, and an optional ``"label"`` for log messages.

    Returns:
        True once the text has been written; False when the data or text is
        missing, the preliminary click fails, or typing raises.
    """
    if not location_data:
        print("No valid location data found for typing.")
        return False

    payload = location_data.get("text_to_type")
    field_name = location_data.get("label", "Typing Action")
    if not payload:
        print(f"Error: 'text_to_type' missing in data for typing action: {location_data}")
        return False

    # Step 1: focus the field, but only if a box was supplied.
    if "box_2d" not in location_data:
        print("Warning: No 'box_2d' provided for 'type' action. Typing at current cursor position.")
    else:
        print(f"Clicking field '{field_name}' before typing...")
        if not perform_click(location_data):
            print(f"Failed to click field '{field_name}' before typing. Aborting type action.")
            return False
        # Small delay after the click before typing starts.
        time.sleep(0.2)

    # Step 2: send the keystrokes, with a short gap between key presses.
    try:
        print(f"Typing text into '{field_name}': '{payload}'")
        pyautogui.write(payload, interval=0.05)
        print("Typing performed.")
    except Exception as err:
        print(f"Error during typing action: {err}")
        return False
    return True
def _split_key_combo(spec):
    """Split a lowercase 'a+b+c' shortcut into key names.

    A trailing '++' means the final key is the plus key itself, so
    'ctrl++' becomes ['ctrl', '+'].  Malformed input yields an empty
    string among the parts, which the caller rejects.
    """
    plus_is_last = spec.endswith("++")
    head = spec[:-2] if plus_is_last else spec
    keys = [part.strip() for part in head.split("+")]
    if plus_is_last:
        keys.append("+")
    return keys


def perform_press_key(location_data: dict):
    """Press a single keyboard key or perform a keyboard shortcut.

    Single keys (e.g. 'enter', 'a', 'f5') go to ``pyautogui.press()``.
    Strings containing '+' (e.g. 'ctrl+c', 'alt+tab') are split on '+' and
    sent to ``pyautogui.hotkey()``.  The plus key itself can be requested as
    '+' on its own or as the last key of a shortcut ('ctrl++').  Key names
    are lower-cased and stripped of surrounding whitespace before use.

    Args:
        location_data: A dictionary containing action details. Expected keys:
            - 'key_to_press' (str): The key or shortcut string.
            - 'label' (str, optional): A descriptive label for the action.

    Returns:
        bool: True if the key action was sent, False if the input was
        missing or malformed or if pyautogui raised.  Exceptions from
        pyautogui are caught and printed, not propagated.
    """
    if not isinstance(location_data, dict) or not location_data:
        print("No valid location data found for key press/shortcut.")
        return False

    key_action_string = location_data.get("key_to_press")
    # Use the provided label or build a default one from the action string.
    label = location_data.get("label", f"Key Action '{key_action_string}'")

    if not key_action_string:
        print(f"Error: 'key_to_press' missing in data for key action: {location_data}")
        return False
    if not isinstance(key_action_string, str):
        print(f"Error: 'key_to_press' must be a string, got {type(key_action_string).__name__}: {key_action_string!r}")
        return False

    # Normalise once so single keys and shortcut parts are treated alike.
    key_spec = key_action_string.strip().lower()
    if not key_spec:
        print("Error: 'key_to_press' is empty or whitespace only.")
        return False

    # A lone '+' is the plus key, not an (empty) shortcut.
    is_shortcut = "+" in key_spec and key_spec != "+"

    try:
        if is_shortcut:
            action_keys = _split_key_combo(key_spec)
            if any(not k for k in action_keys):
                print(f"Error: Invalid shortcut format '{key_action_string}'. Check for extra '+' or empty parts.")
                return False
            print(f"Performing shortcut: '{key_action_string}' (Action label: '{label}')")
            pyautogui.hotkey(*action_keys)
            print("Shortcut performed.")
        else:
            print(f"Pressing key: '{key_action_string}' (Action label: '{label}')")
            pyautogui.press(key_spec)
            print("Key press performed.")
        return True
    except Exception as e:
        # pyautogui may raise various, partly platform-specific, errors.
        print(f"Error during key action '{key_action_string}': {e}")
        return False
# Safety limit kept for backward compatibility.  do_task now performs at most
# one action per call; a value below 1 disables execution entirely.
max_steps = 100


def do_task(original_task, task_not_complted):
    """Parse one action description and perform it with the mouse.

    ``original_task`` is parsed with ``parse_json_safely``.  Only the
    ``"click"`` and ``"double_click"`` action types are executed, through
    ``perform_click`` and ``perform_double_click``.  The action runs once
    per call: the input does not change during a call, so repeating it
    would only repeat the identical click.

    Args:
        original_task: JSON text with ``action_type`` and the keys needed by
            the click helpers, plus optional ``label`` and
            ``task_not_completed``.
        task_not_complted: When truthy, nothing is executed.
            NOTE(review): this gate is kept exactly as originally written,
            although the parameter name suggests the opposite meaning —
            confirm against callers.

    Returns:
        A status string: the "STEP COMPLETED..." message when the action
        was issued (or execution was skipped by the gate), otherwise a
        message starting with "STEP FAILED".
    """
    done_message = "STEP COMPLETED. CHECK BY YOURSELF IF IT HAS ACTUALLY BEEN DONE OR NOT. DO NOT ASK THE USER - JUST VIEW THE RECORDING."
    print("IN do_task()")

    if task_not_complted or max_steps < 1:
        return done_message

    try:
        print(f"\n--- Step 1 for Task: '{original_task}' ---")
        step_info = parse_json_safely(original_task)

        action_type = step_info.get("action_type")
        task_not_completed = step_info.get("task_not_completed", False)
        label = step_info.get("label", "Unknown Action/Element")
        print(f"Gemini identified action: '{action_type}', Target/Label: '{label}'. Task completed this step: {not task_not_completed}")

        # Only pointer actions are handled here.
        action_successful = False
        if action_type == "click":
            action_successful = perform_click(step_info)
        elif action_type == "double_click":
            action_successful = perform_double_click(step_info)
        elif action_type is None:
            print(f"Error: Unknown or missing 'action_type' ('{action_type}') received.")
        else:
            print(f"Error: Action type '{action_type}' is not supported. Only 'click' and 'double_click' are allowed. Keyboard actions should be handled by the function calling model.")

        if not action_successful:
            print(f"Failed to perform action '{action_type}' on '{label}'. Aborting task.")
            return f"STEP FAILED: could not perform action '{action_type}' on '{label}'."

        if not task_not_completed:
            print(f"Task '{original_task}' reported as complete after action '{action_type}'.")
        else:
            print("Step reported the task as not yet complete; a further call is needed for the next step.")
        return done_message
    except Exception as e:
        print("\n--- An unexpected error occurred in the main loop ---")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()  # Print detailed traceback for debugging
        print("-------------------------------------------------------")
        return f"STEP FAILED: unexpected error: {e}"