Spaces:

SolarumAsteridion
/

Lenith

Running

Lenith / computer_control_helper.py

Tecnhotron

First

e78c9e1 3 months ago

28.7 kB

	# --- START OF FILE computer_control_helper.py ---

	from PIL import Image, ImageDraw, ImageFont
	import pyautogui
	import mss # For screen capture
	import json
	import os
	import time, datetime

	def parse_json_safely(json_string: str) -> dict:
	    """Extract and parse the first JSON object embedded in a string.

	    The input is typically an LLM reply, so the object may be wrapped in a
	    markdown code fence and/or surrounded by prose. The function never
	    raises: every failure is reported with ``print`` and yields ``{}``.

	    Extraction strategy:
	      1. Text up to and including an opening ```json tag is dropped.
	      2. A closing ``` is removed only when it terminates the text, so
	         backticks inside JSON string values are left untouched.
	      3. The span from the first '{' to the last '}' is parsed.
	      4. If that span is not valid JSON (e.g. trailing prose contains a
	         '}'), a single object is decoded starting at the first '{' and
	         whatever follows it is ignored.

	    Because parsing always starts at a '{', a list-wrapped object such as
	    ``[{"a": 1}]`` yields the inner dictionary.

	    Args:
	        json_string: Text expected to contain one JSON object.

	    Returns:
	        The parsed dictionary, or ``{}`` when no object could be extracted.
	    """
	    if not isinstance(json_string, str):
	        print(f"Warning: Input is not a string, but type {type(json_string)}. Returning empty dict.")
	        return {}

	    original_stripped = json_string.strip()
	    clean_string = original_stripped

	    # --- Stage 1: markdown fence removal ---
	    # Cutting at the opening tag also discards leading prose, which may
	    # itself contain braces that would confuse the boundary search below.
	    fence_tag = "```json"
	    tag_pos = clean_string.find(fence_tag)
	    if tag_pos != -1:
	        clean_string = clean_string[tag_pos + len(fence_tag):]

	    # Only a fence that ends the text is treated as a closing fence.
	    # Searching for the last ``` anywhere would discard the payload when
	    # the only fence present is an opening one.
	    clean_string = clean_string.strip()
	    if clean_string.endswith("```"):
	        clean_string = clean_string[:-3].strip()

	    if not clean_string:
	        # Stay quiet when the input was empty or consisted of fences only.
	        if original_stripped not in ("", "```json```", "```json\n```"):
	            print("Warning: String is empty after cleaning markdown fences.")
	        return {}

	    # --- Stage 2: locate the object boundaries ---
	    first_brace = clean_string.find('{')
	    last_brace = clean_string.rfind('}')
	    if first_brace == -1 or last_brace < first_brace:
	        # Only warn when the input looked like it was meant to be JSON.
	        if '{' in json_string or '}' in json_string:
	            print(f"Warning: Could not find valid {{...}} structure in the cleaned string.")
	            print(f"Cleaned string snippet: '{clean_string[:100]}...{clean_string[-100:]}'")
	        return {}

	    potential_json = clean_string[first_brace:last_brace + 1]

	    # --- Stage 3: parse ---
	    try:
	        try:
	            parsed = json.loads(potential_json)
	        except json.JSONDecodeError as span_error:
	            # The span may include trailing text with a stray '}'; decode
	            # just the first complete object and ignore the remainder.
	            try:
	                parsed, _ = json.JSONDecoder().raw_decode(clean_string[first_brace:])
	            except json.JSONDecodeError:
	                print(f"Error parsing extracted JSON: {span_error}")
	                print(f"Extracted substring: '{potential_json}'")
	                return {}
	    except RecursionError as e:
	        # Pathologically nested input; keep the never-raise contract.
	        print(f"An unexpected error occurred during JSON extraction or parsing: {e}")
	        print(f"Original string: '{json_string}'")
	        return {}

	    # --- Stage 4: the decoded text starts with '{', so this is a dict ---
	    return parsed if isinstance(parsed, dict) else {}

	def capture_screen() -> Image.Image | None:
	    """Grab one monitor with mss and return it as an RGB PIL image.

	    Monitor index 1 is preferred; when the monitor list has no such entry,
	    index 0 is used instead and a warning is printed.

	    Returns:
	        The captured image, or ``None`` if anything fails (the error is
	        printed rather than raised).
	    """
	    try:
	        with mss.mss() as sct:
	            # mss behavior can vary: monitors[0] is often the combined
	            # "all monitors" area and monitors[1] the primary display.
	            monitor_index = 1
	            if len(sct.monitors) <= monitor_index:
	                print(f"Warning: Monitor index {monitor_index} not found, using monitor 0.")
	                monitor_index = 0  # Fallback

	            sct_img = sct.grab(sct.monitors[monitor_index])
	            # The raw buffer is BGRA; the "BGRX" raw mode reorders it to
	            # RGB and skips the fourth byte.
	            img = Image.frombytes("RGB", sct_img.size, sct_img.bgra, "raw", "BGRX")
	            print(f"Screen captured ({img.width}x{img.height})")
	            return img
	    except Exception as e:
	        # Best-effort boundary: callers receive None instead of an exception.
	        print(f"Error capturing screen: {e}")
	        return None

	def _draw_grid_label(draw, label, font, text_pos, fallback_box, fallback_text_pos):
	    """Draw one red coordinate label on a white backing rectangle."""
	    background = (255, 255, 255, 220)
	    ink = (255, 0, 0)
	    if font:
	        # Size the backing box from the rendered text, padded by 5 px.
	        left, top, right, bottom = draw.textbbox(text_pos, label, font=font)
	        draw.rectangle((left - 5, top - 5, right + 5, bottom + 5), fill=background)
	        draw.text(text_pos, label, fill=ink, font=font)
	    else:
	        # No TrueType font available: fixed-size box and the default font.
	        draw.rectangle(fallback_box, fill=background)
	        draw.text(fallback_text_pos, label, fill=ink)


	def draw_grid_overlay(img):
	    """Return a copy of *img* with a red, labelled coordinate grid drawn on it.

	    The grid has 10 horizontal divisions and a proportional number of
	    vertical divisions (``int(10 * width / height)``, at least 1). Each
	    interior line is labelled with its position on a 0-1000 scale.

	    Every line is placed at the exact fraction its label states. Stepping
	    by a floored spacing instead lets the drawn line drift away from the
	    labelled coordinate as the index grows.

	    Args:
	        img: A PIL image; it is not modified.

	    Returns:
	        The annotated copy, or the original *img* if drawing fails (the
	        error is printed rather than raised).
	    """
	    try:
	        img_copy = img.copy()
	        draw = ImageDraw.Draw(img_copy)
	        width, height = img_copy.size

	        try:
	            font = ImageFont.truetype("arial.ttf", 24)
	        except (OSError, ImportError):
	            # OSError: font file not found/readable. ImportError: Pillow
	            # was built without FreeType support.
	            font = None
	            print("Warning: Could not load font for grid labels. Using default.")

	        line_color = (255, 0, 0)

	        # Horizontal lines, labelled 100..900.
	        h_lines = 10
	        for i in range(1, h_lines):
	            y = round(i * height / h_lines)
	            draw.line([(0, y), (width, y)], fill=line_color, width=2)
	            _draw_grid_label(
	                draw,
	                f"{i * 1000 // h_lines}",
	                font,
	                (10, y - 12),
	                [(5, y - 10), (45, y + 10)],
	                (5, y - 7),
	            )

	        # Vertical lines: keep grid cells roughly square. Clamp to 1 so a
	        # very narrow image cannot cause a division by zero below.
	        aspect_ratio = width / height
	        v_lines = max(1, int(10 * aspect_ratio))
	        for i in range(1, v_lines):
	            x = round(i * width / v_lines)
	            draw.line([(x, 0), (x, height)], fill=line_color, width=2)
	            _draw_grid_label(
	                draw,
	                f"{i * 1000 // v_lines}",
	                font,
	                (x + 5, 10),
	                [(x + 2, 5), (x + 45, 25)],
	                (x + 2, 5),
	            )

	        return img_copy
	    except Exception as e:
	        # Best-effort boundary: an un-annotated screenshot is still usable.
	        print(f"Error drawing grid overlay: {e}")
	        return img  # Return original image if there's an error

	_CLICK_SCREEN_MARGIN = 5  # Pixels kept clear of every screen edge when clicking.


	def _is_valid_normalized_box(box) -> bool:
	    """Return True when *box* is a usable [y_min, x_min, y_max, x_max] on a 0-1000 scale.

	    Problems are reported with ``print``. A very small box only triggers a
	    warning and is still accepted.
	    """
	    if not isinstance(box, (list, tuple)) or len(box) != 4:
	        print(f"Error: Invalid bounding box format received: {box}")
	        return False

	    for coord, name in zip(box, ("y_min", "x_min", "y_max", "x_max")):
	        # bool is a subclass of int, but True/False are never coordinates.
	        if isinstance(coord, bool) or not isinstance(coord, (int, float)):
	            print(f"Error: {name} coordinate is not a number: {coord}")
	            return False
	        # Positive range test: NaN fails every comparison, so it is
	        # rejected here instead of crashing round() later.
	        if not 0 <= coord <= 1000:
	            print(f"Error: {name} coordinate out of range [0, 1000]: {coord}")
	            return False

	    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

	    # Reject zero or negative-area boxes.
	    if x_min_norm >= x_max_norm:
	        print(f"Error: Invalid x-coordinates (min >= max): {x_min_norm} >= {x_max_norm}")
	        return False
	    if y_min_norm >= y_max_norm:
	        print(f"Error: Invalid y-coordinates (min >= max): {y_min_norm} >= {y_max_norm}")
	        return False

	    if x_max_norm - x_min_norm < 5 or y_max_norm - y_min_norm < 5:
	        print(f"Warning: Very small target area detected ({x_max_norm - x_min_norm}x{y_max_norm - y_min_norm}). This might be inaccurate.")

	    return True


	def _click_box_center(location_data, action: str) -> bool:
	    """Move to the center of ``location_data["box_2d"]`` and click there.

	    Shared implementation of perform_click() and perform_double_click().
	    *action* is "click" or "double-click"; it selects the pyautogui call
	    and the wording of the log messages.
	    """
	    if not isinstance(location_data, dict) or "box_2d" not in location_data:
	        print(f"No valid location data found to perform {action}.")
	        return False

	    box = location_data.get("box_2d")
	    label = location_data.get("label", "Unknown Element")

	    # Validate the input before touching the display at all.
	    if not _is_valid_normalized_box(box):
	        return False

	    # Screen dimensions are needed to translate 0-1000 into pixels.
	    try:
	        screen_width, screen_height = pyautogui.size()
	        print(f"Detected screen dimensions: {screen_width}x{screen_height}")
	    except Exception as e:
	        print(f"Error getting screen size: {e}")
	        return False

	    # Box order is y_min, x_min, y_max, x_max (0-1000 each).
	    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

	    # norm / 1000 is the fraction of the screen; round() snaps to a pixel.
	    abs_x_min = round(x_min_norm / 1000 * screen_width)
	    abs_y_min = round(y_min_norm / 1000 * screen_height)
	    abs_x_max = round(x_max_norm / 1000 * screen_width)
	    abs_y_max = round(y_max_norm / 1000 * screen_height)

	    if abs_x_max - abs_x_min < 2:
	        print(f"Warning: X dimension is very small ({abs_x_max - abs_x_min}px), centering might be imprecise")
	    if abs_y_max - abs_y_min < 2:
	        print(f"Warning: Y dimension is very small ({abs_y_max - abs_y_min}px), centering might be imprecise")

	    center_x = round(abs_x_min + (abs_x_max - abs_x_min) / 2)
	    center_y = round(abs_y_min + (abs_y_max - abs_y_min) / 2)

	    purpose = "" if action == "click" else f" for {action}"
	    print(f"Identified '{label}'{purpose} at normalized box: [{y_min_norm}, {x_min_norm}, {y_max_norm}, {x_max_norm}]")
	    print(f"Converted to absolute box: [{abs_y_min}, {abs_x_min}, {abs_y_max}, {abs_x_max}]")
	    print(f"Calculated {action} center: ({center_x}, {center_y})")

	    # Pull the target away from the screen edges.
	    margin = _CLICK_SCREEN_MARGIN
	    action_title = action.capitalize()
	    if center_x < margin or center_x >= screen_width - margin:
	        print(f"Warning: {action_title} X-coordinate ({center_x}) is very close to screen edge")
	        center_x = max(margin, min(center_x, screen_width - margin - 1))
	        print(f"Adjusted X-coordinate to: {center_x}")
	    if center_y < margin or center_y >= screen_height - margin:
	        print(f"Warning: {action_title} Y-coordinate ({center_y}) is very close to screen edge")
	        center_y = max(margin, min(center_y, screen_height - margin - 1))
	        print(f"Adjusted Y-coordinate to: {center_y}")

	    # Clamping can still land outside a screen narrower than two margins.
	    if not (0 <= center_x < screen_width and 0 <= center_y < screen_height):
	        print(f"Error: Calculated {action} coordinates ({center_x}, {center_y}) are outside screen bounds ({screen_width}x{screen_height})!")
	        print(f"Derived from normalized box: {box}")
	        return False

	    try:
	        print(f"Moving mouse to ({center_x}, {center_y}) and {action}ing '{label}'...")

	        # For a long move, stop at the halfway point first.
	        current_x, current_y = pyautogui.position()
	        if abs(current_x - center_x) > 100 or abs(current_y - center_y) > 100:
	            intermediate_x = current_x + (center_x - current_x) // 2
	            intermediate_y = current_y + (center_y - current_y) // 2
	            pyautogui.moveTo(intermediate_x, intermediate_y, duration=0.1)

	        # Final approach is slower for precision.
	        pyautogui.moveTo(center_x, center_y, duration=0.3)

	        # Brief pause before clicking.
	        time.sleep(0.1)

	        if action == "double-click":
	            pyautogui.doubleClick()
	        else:
	            pyautogui.click()
	        print(f"{action_title} performed.")

	        # Report (but do not fail on) a cursor that ended up elsewhere.
	        after_x, after_y = pyautogui.position()
	        if abs(after_x - center_x) > 5 or abs(after_y - center_y) > 5:
	            print(f"Warning: Cursor position after {action} ({after_x}, {after_y}) differs from target ({center_x}, {center_y})")

	        return True
	    except Exception as e:
	        # Boundary around GUI automation: report the failure, do not raise.
	        print(f"Error during mouse action: {e}")
	        return False


	def perform_click(location_data: dict):
	    """Click the center of the bounding box described by *location_data*.

	    Args:
	        location_data: Dictionary with
	            - "box_2d": [y_min, x_min, y_max, x_max], each 0-1000.
	            - "label" (optional): description used in log output.

	    Returns:
	        True if the click was issued, False on invalid input or any error.
	    """
	    return _click_box_center(location_data, "click")


	def perform_double_click(location_data: dict):
	    """Double-click the center of the bounding box described by *location_data*.

	    Accepts the same dictionary as perform_click().

	    Returns:
	        True if the double-click was issued, False on invalid input or any
	        error.
	    """
	    return _click_box_center(location_data, "double-click")

	def perform_type(location_data: dict):
	    """Type ``location_data["text_to_type"]``, clicking the target field first when one is given.

	    When the dictionary contains "box_2d", the field is activated through
	    perform_click() and typing only proceeds if that click succeeds.
	    Without "box_2d" the text is typed at the current cursor position.

	    Returns:
	        True when the text was typed, False on missing data, a failed
	        click, or any typing error.
	    """
	    if not location_data:
	        print("No valid location data found for typing.")
	        return False

	    message = location_data.get("text_to_type")
	    field_name = location_data.get("label", "Typing Action")

	    if not message:
	        print(f"Error: 'text_to_type' missing in data for typing action: {location_data}")
	        return False

	    # Step 1: focus the target field, if a location was supplied.
	    if "box_2d" not in location_data:
	        print("Warning: No 'box_2d' provided for 'type' action. Typing at current cursor position.")
	    else:
	        print(f"Clicking field '{field_name}' before typing...")
	        if not perform_click(location_data):
	            print(f"Failed to click field '{field_name}' before typing. Aborting type action.")
	            return False
	        # Give the field a moment to take focus.
	        time.sleep(0.2)

	    # Step 2: send the keystrokes.
	    try:
	        print(f"Typing text into '{field_name}': '{message}'")
	        # A short interval between keys is more reliable in slow apps.
	        pyautogui.write(message, interval=0.05)
	        print("Typing performed.")
	        return True
	    except Exception as e:
	        print(f"Error during typing action: {e}")
	        return False

	def perform_press_key(location_data: dict):
	    """Press a single key or a keyboard shortcut with pyautogui.

	    The value of 'key_to_press' is lower-cased and stripped. If it contains
	    a '+' separator it is treated as a shortcut and sent with
	    pyautogui.hotkey(); otherwise it is sent with pyautogui.press().

	    The literal plus key is supported: "+" on its own is a single key, and
	    a shortcut may end in "++" to use plus as its last key ("ctrl++").

	    Args:
	        location_data: A dictionary containing action details. Expected keys:
	            - 'key_to_press' (str): The key or shortcut (e.g. 'enter', 'ctrl+c').
	            - 'label' (str, optional): A descriptive label for the action.

	    Returns:
	        bool: True if the key or shortcut was sent, False for missing or
	        malformed input or when pyautogui raises. Exceptions from
	        pyautogui are caught and reported, not propagated.
	    """
	    if not location_data:
	        print("No valid location data found for key press/shortcut.")
	        return False

	    key_action_string = location_data.get("key_to_press")
	    # Use the provided label or derive one from the action string.
	    label = location_data.get("label", f"Key Action '{key_action_string}'")

	    if not key_action_string:
	        print(f"Error: 'key_to_press' missing in data for key action: {location_data}")
	        return False

	    # Parsed JSON may carry a number here; .lower() would raise on it.
	    if not isinstance(key_action_string, str):
	        print(f"Error: 'key_to_press' must be a string, got {type(key_action_string).__name__}: {key_action_string!r}")
	        return False

	    normalized = key_action_string.lower().strip()
	    if not normalized:
	        print(f"Error: 'key_to_press' is empty or whitespace only.")
	        return False

	    # A lone "+" is the plus key itself, not an empty shortcut.
	    is_shortcut = '+' in normalized and normalized != '+'

	    action_keys = []
	    if is_shortcut:
	        if normalized.endswith('++'):
	            # "ctrl++": the final '+' is the key, the one before it the separator.
	            raw_keys = normalized[:-2].split('+') + ['+']
	        else:
	            raw_keys = normalized.split('+')
	        action_keys = [key.strip() for key in raw_keys]
	        if any(not key for key in action_keys):
	            print(f"Error: Invalid shortcut format '{key_action_string}'. Check for extra '+' or empty parts.")
	            return False

	    try:
	        if is_shortcut:
	            print(f"Performing shortcut: '{key_action_string}' (Action label: '{label}')")
	            pyautogui.hotkey(*action_keys)
	            print("Shortcut performed.")
	        else:
	            print(f"Pressing key: '{key_action_string}' (Action label: '{label}')")
	            pyautogui.press(normalized)
	            print("Key press performed.")
	        return True
	    except Exception as e:
	        # Boundary around GUI automation: report the failure, do not raise.
	        print(f"Error during key action '{key_action_string}': {e}")
	        return False


	max_steps = 100 # Safety break to prevent infinite loops


	# do_task: parse the JSON text in `original_task` and perform the mouse action
	# it describes.
	#
	# Parameters:
	#   original_task     - text passed to parse_json_safely(); the keys read from
	#                       the result here are "action_type", "task_not_completed"
	#                       and "label". The parsed dict is also handed to
	#                       perform_click() / perform_double_click().
	#   task_not_complted - loop guard: the loop body runs only while this is falsy.
	#
	# Returns: the fixed "STEP COMPLETED..." status string when the `return`
	#   statement is reached. No other return statement is visible, so paths that
	#   do not reach it (including the exception handler) yield None.
	#
	# NOTE(review): indentation was lost in this copy of the file, so whether the
	#   `return` sits inside or after the `while` loop cannot be determined from
	#   here - confirm against the original source before changing control flow.
	# NOTE(review): the parameter `task_not_complted` (no second "e") is never
	#   reassigned; the local `task_not_completed` is a different name, so the
	#   JSON flag never changes the `while` condition.
	# NOTE(review): `original_task` is re-parsed unchanged on every iteration, so
	#   any further iteration would repeat the same action.
	def do_task(original_task, task_not_complted):
	print("IN do_task()")
	step_count = 0
	try:
	while not task_not_complted and step_count < max_steps:
	step_count += 1
	print(f"\n--- Step {step_count} for Task: '{original_task}' ---")

	# 1. Capture Screen
	# NOTE(review): `screen_image` is assigned but not used anywhere in this function.
	print("Capturing screen...")
	screen_image = capture_screen()

	# Parse the json input
	step_info = parse_json_safely(original_task)

	# 3. Extract info from Gemini's response
	action_type = step_info.get("action_type") # Get the action type
	task_not_completed = step_info.get("task_not_completed", False) # Default to False if missing
	label = step_info.get("label", "Unknown Action/Element") # Get label

	print(f"Gemini identified action: '{action_type}', Target/Label: '{label}'. Task completed this step: {not task_not_completed}")

	# 4. Perform Action based on type - only support click and double_click
	action_successful = False # Initialize before action attempt

	if action_type == "click":
	action_successful = perform_click(step_info)
	elif action_type == "double_click":
	action_successful = perform_double_click(step_info)
	elif action_type not in ["click", "double_click"]:
	# Handle invalid or unsupported action types (keyboard actions should go to function model)
	print(f"Error: Action type '{action_type}' is not supported. Only 'click' and 'double_click' are allowed. Keyboard actions should be handled by the function calling model.")
	action_successful = False
	else:
	# This case handles missing action_type
	# NOTE(review): unreachable - the preceding `elif` is true for every value
	# (including None) that the first two comparisons did not match.
	print(f"Error: Unknown or missing 'action_type' ('{action_type}') received.")
	action_successful = False

	# 5. Check if action failed and abort if necessary
	if not action_successful:
	print(f"Failed to perform action '{action_type}' on '{label}'. Aborting task.")
	break # Exit the loop on failure

	# 6. Check if task is finished based on Gemini's flag
	if not task_not_completed:
	print(f"Task '{original_task}' reported as complete after action '{action_type}'.")
	break # Exit the loop, task is done

	return "STEP COMPLETED. CHECK BY YOURSELF IF IT HAS ACTUALLY BEEN DONE OR NOT. DO NOT ASK THE USER - JUST VIEW THE RECORDING."


	except Exception as e:
	# Errors are printed with a traceback and not re-raised.
	print(f"\n--- An unexpected error occurred in the main loop ---")
	print(f"Error: {e}")
	import traceback
	traceback.print_exc() # Print detailed traceback for debugging
	print("-------------------------------------------------------")