Spaces:

SolarumAsteridion
/

Lenith

Running

File size: 28,702 Bytes

314597a

# --- START OF FILE computer_control_helper.py ---

from PIL import Image, ImageDraw, ImageFont
import pyautogui
import mss # For screen capture
import json
import os
import time, datetime

def _decode_first_json_value(text: str, brace_at: int):
    """Decode the JSON object starting at ``text[brace_at]``, or the array enclosing it.

    ``json.JSONDecoder.raw_decode`` stops at the end of the first complete
    value, so anything after it (closing markdown fences, commentary, stray
    braces) is ignored instead of breaking the parse.

    Args:
        text: Cleaned model output that contains a '{' at ``brace_at``.
        brace_at: Index of the first '{' in ``text``.

    Returns:
        A dict, or a list when the object turned out to sit inside a JSON array.

    Raises:
        json.JSONDecodeError: If no valid JSON value starts at ``brace_at``.
    """
    decoder = json.JSONDecoder()

    bracket_at = text.find('[')
    if 0 <= bracket_at < brace_at:
        try:
            value, consumed = decoder.raw_decode(text[bracket_at:])
        except json.JSONDecodeError:
            # Not a real array (e.g. "[note] {...}"); decode the object itself.
            pass
        else:
            # Only trust the array if it actually wraps the object; an
            # unrelated "[1]" earlier in the prose must not shadow it.
            if bracket_at + consumed > brace_at:
                return value

    value, _ = decoder.raw_decode(text[brace_at:])
    return value


def parse_json_safely(json_string: str) -> dict:
    """
    Extracts a JSON object (dictionary) from a string of model output.

    Tolerates leading/trailing prose, markdown code fences (with or without a
    closing fence) and text after the object. A list wrapping exactly one
    dictionary is unwrapped; any other list is rejected.

    Args:
        json_string: Raw text expected to contain one JSON object.

    Returns:
        The parsed dictionary, or an empty dict when the input is not a
        string, contains no ``{...}`` structure, or cannot be parsed. This
        function never raises; problems are reported with ``print``.
    """
    if not isinstance(json_string, str):
        print(f"Warning: Input is not a string, but type {type(json_string)}. Returning empty dict.")
        return {}

    try:
        # --- Stage 1: Skip everything up to and including an opening ```json fence ---
        # Prose before the fence may contain braces of its own; starting after
        # the fence keeps them from being mistaken for the payload. A closing
        # fence needs no handling because decoding stops at the end of the value.
        clean_string = json_string.strip()
        fence_at = clean_string.find("```json")
        if fence_at != -1:
            clean_string = clean_string[fence_at + len("```json"):]
        clean_string = clean_string.strip()

        if not clean_string:
            # Stay quiet when the input itself was empty.
            if json_string.strip():
                print("Warning: String is empty after cleaning markdown fences.")
            return {}

        # --- Stage 2: Find the JSON Object Boundaries ---
        first_brace = clean_string.find('{')
        last_brace = clean_string.rfind('}')

        if first_brace == -1 or last_brace < first_brace:
            # Avoid printing a warning if the original string clearly wasn't meant to be JSON
            if '{' in json_string or '}' in json_string:
                print(f"Warning: Could not find valid {{...}} structure in the cleaned string.")
                print(f"Cleaned string snippet: '{clean_string[:100]}...{clean_string[-100:]}'")
            return {}

        # --- Stage 3: Decode the first complete value ---
        try:
            parsed = _decode_first_json_value(clean_string, first_brace)
        except json.JSONDecodeError as e:
            print(f"Error parsing extracted JSON: {e}")
            print(f"Extracted substring: '{clean_string[first_brace:last_brace + 1]}'")
            return {}

        # --- Stage 4: Validate the Parsed Structure ---
        if isinstance(parsed, dict):
            return parsed

        # Handle the case where the LLM wrapped a single dict in a list
        if isinstance(parsed, list) and len(parsed) == 1 and isinstance(parsed[0], dict):
            print("Warning: JSON was wrapped in a list, extracting the single dictionary.")
            return parsed[0]

        if isinstance(parsed, list):
            print(f"Warning: Parsed JSON is a list, not a dictionary or list-of-one-dict. Content: {parsed}")
        else:
            print(f"Warning: Parsed JSON is not a dictionary (type: {type(parsed)}). Content: {parsed}")
        return {}  # The function promises a dict

    except Exception as e:
        # Catch any other unexpected errors (e.g. RecursionError on absurd nesting)
        print(f"An unexpected error occurred during JSON extraction or parsing: {e}")
        print(f"Original string: '{json_string}'")
        return {}

def capture_screen() -> Image.Image | None:
    """Grab one monitor via mss and hand it back as an RGB PIL Image.

    Monitor index 1 is used when mss lists it; otherwise index 0 is used
    instead (with a printed warning). Any exception is printed and turned
    into a ``None`` return.
    """
    try:
        with mss.mss() as grabber:
            available = grabber.monitors

            # mss commonly exposes the combined desktop at index 0 and the
            # primary display at index 1, but that can vary.
            chosen = 1
            if chosen >= len(available):
                print(f"Warning: Monitor index {chosen} not found, using monitor 0.")
                chosen = 0  # Fallback

            shot = grabber.grab(available[chosen])

            # mss delivers BGRA bytes; decode them as BGRX so the result is plain RGB.
            # The capture is kept at full resolution (no thumbnail step).
            frame = Image.frombytes("RGB", shot.size, shot.bgra, "raw", "BGRX")
            print(f"Screen captured ({frame.width}x{frame.height})")
            return frame
    except Exception as err:
        print(f"Error capturing screen: {err}")
        return None

def _draw_grid_label(draw, label, font, font_origin, plain_origin, plain_box):
    """Draw one red coordinate label on a white backing rectangle.

    With a TrueType font the backing box is the measured text box padded by
    5px; with the default font a fixed ``plain_box`` is used instead.
    """
    background = (255, 255, 255, 220)
    ink = (255, 0, 0)
    if font:
        left, top, right, bottom = draw.textbbox(font_origin, label, font=font)
        draw.rectangle((left - 5, top - 5, right + 5, bottom + 5), fill=background)
        draw.text(font_origin, label, fill=ink, font=font)
    else:
        draw.rectangle(plain_box, fill=background)
        draw.text(plain_origin, label, fill=ink)


def draw_grid_overlay(img):
    """Return a copy of ``img`` with a red, labelled coordinate grid drawn on it.

    The height is split into 10 bands (9 interior horizontal lines); the
    number of vertical divisions is scaled by the aspect ratio so cells stay
    roughly square. Each line is labelled with its position on a 0-1000
    scale, and is drawn at exactly that fraction of the image so label and
    line agree even when the size is not divisible by the division count.

    Args:
        img: Image object exposing ``copy()`` and ``size`` (a PIL Image).

    Returns:
        The annotated copy, or the original ``img`` unchanged if drawing fails.
    """
    try:
        # Work on a copy to avoid modifying the original
        img_copy = img.copy()
        draw = ImageDraw.Draw(img_copy)
        width, height = img_copy.size

        # Best-effort font load; fall back to the default bitmap font.
        # Deliberately not a bare except, so Ctrl-C / SystemExit still propagate.
        try:
            font = ImageFont.truetype("arial.ttf", 24)
        except Exception:
            font = None
            print("Warning: Could not load font for grid labels. Using default.")

        # Horizontal lines with y labels
        h_divisions = 10
        for i in range(1, h_divisions):
            y = round(i * height / h_divisions)
            draw.line([(0, y), (width, y)], fill=(255, 0, 0), width=2)

            y_norm = int((i / h_divisions) * 1000)
            _draw_grid_label(
                draw, f"{y_norm}", font,
                font_origin=(10, y - 12),
                plain_origin=(5, y - 7),
                plain_box=[(5, y - 10), (45, y + 10)],
            )

        # Vertical divisions proportional to the aspect ratio; at least one so
        # a very tall image cannot produce a zero divisor below.
        aspect_ratio = width / height
        v_lines = max(1, int(10 * aspect_ratio))

        # Vertical lines with x labels
        for i in range(1, v_lines):
            x = round(i * width / v_lines)
            draw.line([(x, 0), (x, height)], fill=(255, 0, 0), width=2)

            x_norm = int((i / v_lines) * 1000)
            _draw_grid_label(
                draw, f"{x_norm}", font,
                font_origin=(x + 5, 10),
                plain_origin=(x + 2, 5),
                plain_box=[(x + 2, 5), (x + 45, 25)],
            )

        return img_copy
    except Exception as e:
        print(f"Error drawing grid overlay: {e}")
        return img  # Return original image if there's an error

def _validate_box(box) -> bool:
    """Return True when ``box`` is a usable ``[y_min, x_min, y_max, x_max]`` on the 0-1000 scale.

    Prints the reason and returns False otherwise.
    """
    if not isinstance(box, (list, tuple)) or len(box) != 4:
        print(f"Error: Invalid bounding box format received: {box}")
        return False

    for coord, name in zip(box, ("y_min", "x_min", "y_max", "x_max")):
        # bool is a subclass of int but is never a meaningful coordinate.
        if isinstance(coord, bool) or not isinstance(coord, (int, float)):
            print(f"Error: {name} coordinate is not a number: {coord}")
            return False
        # Chained comparison on purpose: NaN fails every comparison, so it is
        # rejected here instead of crashing round() later on.
        if not 0 <= coord <= 1000:
            print(f"Error: {name} coordinate out of range [0, 1000]: {coord}")
            return False

    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

    # Zero or negative-area boxes
    if x_min_norm >= x_max_norm:
        print(f"Error: Invalid x-coordinates (min >= max): {x_min_norm} >= {x_max_norm}")
        return False
    if y_min_norm >= y_max_norm:
        print(f"Error: Invalid y-coordinates (min >= max): {y_min_norm} >= {y_max_norm}")
        return False

    return True


def _clamp_to_safe_zone(value, screen_extent, axis, action, margin=5):
    """Pull a coordinate at least ``margin`` pixels away from both screen edges."""
    if value < margin or value >= screen_extent - margin:
        print(f"Warning: {action.capitalize()} {axis}-coordinate ({value}) is very close to screen edge")
        value = max(margin, min(value, screen_extent - margin - 1))
        print(f"Adjusted {axis}-coordinate to: {value}")
    return value


def _resolve_click_point(location_data, action):
    """Turn ``location_data['box_2d']`` into an on-screen pixel target.

    Args:
        location_data: Dict with 'box_2d' and optionally 'label'.
        action: "click" or "double-click"; only used in the printed messages.

    Returns:
        ``(center_x, center_y, label)`` on success, or ``None`` when the data
        is invalid, the screen size is unavailable, or the point is off-screen.
    """
    if not isinstance(location_data, dict) or "box_2d" not in location_data:
        print(f"No valid location data found to perform {action}.")
        return None

    box = location_data.get("box_2d")
    label = location_data.get("label", "Unknown Element")

    # Validate before touching the screen so bad input never reaches pyautogui.
    if not _validate_box(box):
        return None

    # Screen dimensions are needed for coordinate translation
    try:
        screen_width, screen_height = pyautogui.size()
        print(f"Detected screen dimensions: {screen_width}x{screen_height}")
    except Exception as e:
        print(f"Error getting screen size: {e}")
        return None

    # Box order is y_min, x_min, y_max, x_max on a 0-1000 scale
    y_min_norm, x_min_norm, y_max_norm, x_max_norm = box

    # Extremely small boxes are suspicious but still attempted
    if x_max_norm - x_min_norm < 5 or y_max_norm - y_min_norm < 5:
        print(f"Warning: Very small target area detected ({x_max_norm - x_min_norm}x{y_max_norm - y_min_norm}). This might be inaccurate.")

    # norm / 1000 is the 0.0-1.0 ratio; scale by the screen size and round to a pixel
    abs_x_min = round(x_min_norm / 1000 * screen_width)
    abs_y_min = round(y_min_norm / 1000 * screen_height)
    abs_x_max = round(x_max_norm / 1000 * screen_width)
    abs_y_max = round(y_max_norm / 1000 * screen_height)

    if abs_x_max - abs_x_min < 2:
        print(f"Warning: X dimension is very small ({abs_x_max - abs_x_min}px), centering might be imprecise")
    if abs_y_max - abs_y_min < 2:
        print(f"Warning: Y dimension is very small ({abs_y_max - abs_y_min}px), centering might be imprecise")

    # Centre computed in floating point, rounded once at the end
    center_x = round(abs_x_min + (abs_x_max - abs_x_min) / 2)
    center_y = round(abs_y_min + (abs_y_max - abs_y_min) / 2)

    purpose = "" if action == "click" else f" for {action}"
    print(f"Identified '{label}'{purpose} at normalized box: [{y_min_norm}, {x_min_norm}, {y_max_norm}, {x_max_norm}]")
    print(f"Converted to absolute box: [{abs_y_min}, {abs_x_min}, {abs_y_max}, {abs_x_max}]")
    print(f"Calculated {action} center: ({center_x}, {center_y})")

    # Keep the pointer out of the outermost pixels of the screen
    center_x = _clamp_to_safe_zone(center_x, screen_width, "X", action)
    center_y = _clamp_to_safe_zone(center_y, screen_height, "Y", action)

    # Clamping cannot help when the screen is narrower than twice the margin
    if not (0 <= center_x < screen_width and 0 <= center_y < screen_height):
        print(f"Error: Calculated {action} coordinates ({center_x}, {center_y}) are outside screen bounds ({screen_width}x{screen_height})!")
        print(f"Derived from normalized box: {box}")
        return None

    return center_x, center_y, label


def _click_at_box(location_data, action) -> bool:
    """Move the mouse to the centre of the box and perform ``action`` there.

    ``action`` is "click" or "double-click". Returns True when the mouse
    action ran without raising, False otherwise.
    """
    target = _resolve_click_point(location_data, action)
    if target is None:
        return False
    center_x, center_y, label = target

    try:
        print(f"Moving mouse to ({center_x}, {center_y}) and {action}ing '{label}'...")

        # For long travels, stop halfway first, then approach the target slowly
        current_x, current_y = pyautogui.position()
        if abs(current_x - center_x) > 100 or abs(current_y - center_y) > 100:
            intermediate_x = current_x + (center_x - current_x) // 2
            intermediate_y = current_y + (center_y - current_y) // 2
            pyautogui.moveTo(intermediate_x, intermediate_y, duration=0.1)

        pyautogui.moveTo(center_x, center_y, duration=0.3)

        # Brief pause before clicking
        time.sleep(0.1)

        if action == "double-click":
            pyautogui.doubleClick()
        else:
            pyautogui.click()
        print(f"{action.capitalize()} performed.")

        # Report (but do not fail on) a pointer that ended up elsewhere
        after_x, after_y = pyautogui.position()
        if abs(after_x - center_x) > 5 or abs(after_y - center_y) > 5:
            print(f"Warning: Cursor position after {action} ({after_x}, {after_y}) differs from target ({center_x}, {center_y})")

        return True
    except Exception as e:
        print(f"Error during mouse action: {e}")
        return False


def perform_click(location_data: dict) -> bool:
    """Clicks the centre of ``location_data['box_2d']``.

    Args:
        location_data: Dict with 'box_2d' as ``[y_min, x_min, y_max, x_max]``
            on a 0-1000 scale, and an optional 'label' used in messages.

    Returns:
        True if the click was performed, False on invalid data or any error.
    """
    return _click_at_box(location_data, "click")


def perform_double_click(location_data: dict) -> bool:
    """Double-clicks the centre of ``location_data['box_2d']``.

    Args:
        location_data: Dict with 'box_2d' as ``[y_min, x_min, y_max, x_max]``
            on a 0-1000 scale, and an optional 'label' used in messages.

    Returns:
        True if the double-click was performed, False on invalid data or any error.
    """
    return _click_at_box(location_data, "double-click")

def perform_type(location_data: dict):
    """Type ``text_to_type`` on the keyboard, clicking ``box_2d`` first when one is supplied.

    Returns True once the text has been written. Returns False when the data
    is empty, 'text_to_type' is missing/empty, the preliminary click fails,
    or the typing call raises.
    """
    if not location_data:
        print("No valid location data found for typing.")
        return False

    field_label = location_data.get("label", "Typing Action")
    payload = location_data.get("text_to_type")
    if not payload:
        print(f"Error: 'text_to_type' missing in data for typing action: {location_data}")
        return False

    # --- Step 1: focus the target field when a box is available ---
    if "box_2d" not in location_data:
        print("Warning: No 'box_2d' provided for 'type' action. Typing at current cursor position.")
    else:
        print(f"Clicking field '{field_label}' before typing...")
        if not perform_click(location_data):
            print(f"Failed to click field '{field_label}' before typing. Aborting type action.")
            return False
        # Give the field a moment to take focus before keys are sent
        time.sleep(0.2)

    # --- Step 2: send the keystrokes ---
    try:
        print(f"Typing text into '{field_label}': '{payload}'")
        # A short gap between key presses helps slower applications keep up
        pyautogui.write(payload, interval=0.05)
        print("Typing performed.")
    except Exception as err:
        print(f"Error during typing action: {err}")
        return False
    return True

def perform_press_key(location_data: dict):
    """
    Presses a specific keyboard key or performs a keyboard shortcut.

    Handles single keys (e.g., 'enter', 'a', 'f5') using pyautogui.press()
    and shortcuts (e.g., 'ctrl+c', 'alt+tab', 'ctrl+alt+delete') using
    pyautogui.hotkey(). Shortcuts are identified by the presence of '+'
    in the 'key_to_press' string. The string is stripped and lower-cased
    before being handed to pyautogui.

    Args:
        location_data: A dictionary containing action details. Expected keys:
            - 'key_to_press' (str): The key or shortcut string (e.g., 'enter', 'ctrl+c').
            - 'label' (str, optional): A descriptive label for the action.

    Returns:
        bool: True if the action was performed successfully, False otherwise
        (missing/non-string/empty key, malformed shortcut, or any exception
        raised by pyautogui, which is caught and printed rather than propagated).
    """
    if not location_data:
        print("No valid location data found for key press/shortcut.")
        return False

    key_action_string = location_data.get("key_to_press")
    # Use the provided label or create a default one based on the action string
    label = location_data.get("label", f"Key Action '{key_action_string}'")

    if not key_action_string:
        print(f"Error: 'key_to_press' missing in data for key action: {location_data}")
        return False

    # Model output is untrusted: a number or list here would otherwise crash on .lower()
    if not isinstance(key_action_string, str):
        print(f"Error: 'key_to_press' must be a string, got {type(key_action_string).__name__}: {key_action_string!r}")
        return False

    # Strip and lowercase for execution consistency
    normalized = key_action_string.strip().lower()
    if not normalized:
        print(f"Error: 'key_to_press' is empty or whitespace only.")
        return False

    action_keys = None
    if '+' in normalized:
        action_keys = [part.strip() for part in normalized.split('+')]
        # Reject empty components such as 'ctrl++c' or a trailing '+'
        if any(not part for part in action_keys):
            print(f"Error: Invalid shortcut format '{key_action_string}'. Check for extra '+' or empty parts.")
            return False

    # --- Perform Key Press or Shortcut ---
    try:
        if action_keys is not None:
            print(f"Performing shortcut: '{key_action_string}' (Action label: '{label}')")
            pyautogui.hotkey(*action_keys)  # Unpack the list into arguments
            print("Shortcut performed.")
        else:
            print(f"Pressing key: '{key_action_string}' (Action label: '{label}')")
            pyautogui.press(normalized)
            print("Key press performed.")
        return True
    except Exception as e:
        # pyautogui may raise various errors, including platform-specific ones
        print(f"Error during key action '{key_action_string}': {e}")
        return False


max_steps = 100 # Safety break to prevent infinite loops


def do_task(original_task, task_not_complted):
    """Execute the single mouse action described by a JSON task string.

    ``original_task`` is parsed with ``parse_json_safely``; its 'action_type'
    selects ``perform_click`` ("click") or ``perform_double_click``
    ("double_click"). Keyboard actions are intentionally not handled here.
    Exactly one action is attempted per call.

    Args:
        original_task: JSON text with 'action_type', 'box_2d' and optionally
            'label' and 'task_not_completed'.
        task_not_complted: Gate flag (spelling kept for existing callers).
            The action runs only when this is falsy.
            NOTE(review): the name suggests the opposite polarity ("not
            completed" -> keep working); the existing behaviour is preserved
            because callers are not visible here — confirm the intent.

    Returns:
        A status string when the action succeeded and the task JSON says
        'task_not_completed' is true; ``None`` when nothing was attempted,
        the action failed, the JSON reports the task as complete, or an
        unexpected error occurred.
    """
    print("IN do_task()")

    if task_not_complted or max_steps <= 0:
        return None

    step_count = 1
    try:
        print(f"\n--- Step {step_count} for Task: '{original_task}' ---")

        # 1. Parse the json input (yields {} when it cannot be parsed)
        step_info = parse_json_safely(original_task)

        # 2. Extract info from the parsed response
        action_type = step_info.get("action_type")
        task_not_completed = step_info.get("task_not_completed", False) # Default to False if missing
        label = step_info.get("label", "Unknown Action/Element")

        print(f"Gemini identified action: '{action_type}', Target/Label: '{label}'. Task completed this step: {not task_not_completed}")

        # 3. Perform the action - only click and double_click are supported
        if action_type == "click":
            action_successful = perform_click(step_info)
        elif action_type == "double_click":
            action_successful = perform_double_click(step_info)
        elif not action_type:
            # Missing action_type (including unparseable input)
            print(f"Error: Unknown or missing 'action_type' ('{action_type}') received.")
            action_successful = False
        else:
            # Unsupported action types (keyboard actions should go to function model)
            print(f"Error: Action type '{action_type}' is not supported. Only 'click' and 'double_click' are allowed. Keyboard actions should be handled by the function calling model.")
            action_successful = False

        # 4. Abort on failure
        if not action_successful:
            print(f"Failed to perform action '{action_type}' on '{label}'. Aborting task.")
            return None

        # 5. Stop when the task JSON reports completion
        if not task_not_completed:
            print(f"Task '{original_task}' reported as complete after action '{action_type}'.")
            return None

        return "STEP COMPLETED. CHECK BY YOURSELF IF IT HAS ACTUALLY BEEN DONE OR NOT. DO NOT ASK THE USER - JUST VIEW THE RECORDING."

    except Exception as e:
        print(f"\n--- An unexpected error occurred in the main loop ---")
        print(f"Error: {e}")
        import traceback
        traceback.print_exc() # Print detailed traceback for debugging
        print("-------------------------------------------------------")
        return None