Spaces:
Sleeping
Sleeping
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.

    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)

    Returns:
        A summary string with the total match count and the focused occurrence.

    Raises:
        Exception: If fewer than `nth_result` matches exist on the page.
    """
    # NOTE(review): `text` is interpolated directly into the XPath expression,
    # so a value containing a single quote would break the query.
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    # Scroll the chosen match into view so a follow-up screenshot shows it.
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    # Fix: leading space so the two sentences are not fused in the returned message.
    result += f" Focused on element {nth_result} of {len(elements)}"
    return result
def go_back() -> None:
    """Navigate the browser back to the previously visited page."""
    driver.back()
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.

    Returns:
        A short confirmation message describing the action taken.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    # Fix: the function is annotated `-> str` but previously returned None;
    # return a confirmation so the declared tool contract holds.
    return "Sent ESC key to close any visible pop-up."
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a browser screenshot and the current URL to the step log.

    Args:
        step_log: The log entry of the step that just finished.
        agent: The running agent; its older step logs are pruned of screenshots.
    """
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = step_log.step_number
    if driver is not None:
        # Remove screenshots from steps older than two back, for lean processing.
        # Fix: the original tested `step_log` (the callback argument) inside the
        # loop instead of the loop variable, so the intended pruning never ran.
        for previous_step in agent.logs:
            if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
                previous_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        step_log.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!
        # Update observations with current URL.
        # Fix: read `step_log.observations` (the current step), not the stale
        # loop variable `step_logs`, when deciding whether to append.
        url_info = f"Current url: {driver.current_url}"
        step_log.observations = url_info if step_log.observations is None else step_log.observations + "\n" + url_info
    return
from smolagents import CodeAgent, OpenAIServerModel, DuckDuckGoSearchTool

# Default model: gpt-4o via the OpenAI-compatible server interface.
model = OpenAIServerModel(model_id="gpt-4o")
############# OpenAIServerModel: Connects to any service that offers an OpenAI API interface.
# Alternative explicit configuration (endpoint + key from the environment):
#model = OpenAIServerModel(
#    model_id="gpt-4o",
#    api_base="https://api.openai.com/v1",
#    api_key=os.environ["OPENAI_API_KEY"],
#)

# Browser-automation agent: search + navigation tools, helium authorized in
# generated code, and a screenshot attached to every step via the callback.
agent = CodeAgent(
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)
# Prompt for single-image analysis: asks the model to describe and structure
# the visual content of one image/screenshot.
prompt_analysis="""Extract information from an image by analyzing and interpreting its visual elements to provide a detailed description or identify specific data.

# Steps

1. **Analyze the Image**: Identify key elements such as objects, text, colors, and any notable features or contexts.
2. **Interpret Visual Elements**: Determine the significance or purpose of the elements identified. Consider relationships between objects, text recognition if applicable, and any context clues.
3. **Synthesize Information**: Bring together the interpreted elements to form a coherent understanding or summary.
4. **Verify Details**: Ensure accuracy by cross-referencing identifiable text or icons with known data or references, if relevant.

# Output Format

The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each identified element should be clearly described along with its context or significance.

# Example

**Input**: (An image with a storefront displaying 'Bakery' sign and a variety of bread on display.)

**Output**:
Description:
- **Storefront**: A bakery
- **Signage Text**: "Bakery"
- **Products**: Various types of bread

JSON Example:
```json
{
  "storeType": "Bakery",
  "signText": "Bakery",
  "products": ["bread", "baguette", "pastry"]
}
```

# Notes

- Consider optical character recognition (OCR) for text extraction.
- Evaluate colors and objects for brand or function associations.
- Provide a holistic overview rather than disjointed elements when possible."""
# Prompt for video analysis: same structure as the image prompt, extended to
# motion, audio, and scene changes.
prompt_deep_analysis="""Extract information from a video by analyzing and interpreting its audiovisual elements to provide a detailed description or identify specific data.
You will have specific information to retrieve from the video. Adapt analysis steps to cater for motion, audio, and potential scene changes unique to video content.

# Steps

1. **Parse the Video**: Break down the video into manageable segments, focusing on scenes or timeframes relevant to the target information.
2. **Identify Key Elements**: Within these segments, identify crucial visual and audio elements such as objects, text, dialogue, sounds, and any notable features or contexts.
3. **Interpret Audiovisual Elements**: Determine the significance or purpose of the identified elements. Consider relationships between objects, text recognition, audio cues, and any context provided by the video.
4. **Synthesize Information**: Integrate the interpreted elements to form a coherent understanding or summary.
5. **Verify Details**: Ensure accuracy by cross-referencing identifiable text, icons, or audio snippets with known data or references, if relevant.

# Output Format

The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each element should be described along with its context or significance within the video.

# Examples

**Input**: (A video of a cooking show with captions and background music.)

**Output**:
Description:
- **Scene**: Cooking demonstration of a pasta dish
- **Captions**: Step-by-step instructions
- **Audio**: Background music, presenter dialogue
- **Visual Elements**: Ingredients and cooking utensils

JSON Example:
```json
{
  "sceneType": "Cooking Demonstration",
  "captions": ["Boil water", "Add pasta"],
  "audio": {
    "backgroundMusic": "light jazz",
    "dialogue": ["Today we are making pasta..."]
  },
  "visualElements": ["pasta", "saucepan", "spoon"]
}
```

# Notes

- Consider using video timestamp and scene identification for accurate element referencing.
- Evaluate both visual and audio elements for context comprehension.
- Ensure that video dynamics like scene changes or motion are accounted for in the synthesis of information."""