# Smolagent_to_GAIA_benchmark/vision_agent.py

import os  # only needed if you configure OpenAIServerModel from environment variables
from io import BytesIO
from time import sleep

import helium
from PIL import Image
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

from smolagents import CodeAgent, DuckDuckGoSearchTool, OpenAIServerModel, tool
from smolagents.agents import ActionStep

# Start the Chrome session the browsing tools below operate on. How the original
# file initialized the driver is not shown; a plain helium session is assumed.
driver = helium.start_chrome(headless=False)
@tool
def search_item_ctrl_f(text: str, nth_result: int = 1) -> str:
    """
    Searches for text on the current page via Ctrl + F and jumps to the nth occurrence.

    Args:
        text: The text to search for
        nth_result: Which occurrence to jump to (default: 1)
    """
    elements = driver.find_elements(By.XPATH, f"//*[contains(text(), '{text}')]")
    if nth_result > len(elements):
        raise Exception(f"Match n°{nth_result} not found (only {len(elements)} matches found)")
    result = f"Found {len(elements)} matches for '{text}'."
    elem = elements[nth_result - 1]
    driver.execute_script("arguments[0].scrollIntoView(true);", elem)
    result += f" Focused on element {nth_result} of {len(elements)}."
    return result
@tool
def go_back() -> None:
    """Goes back to the previous page."""
    driver.back()
@tool
def close_popups() -> str:
    """
    Closes any visible modal or pop-up on the page. Use this to dismiss pop-up windows! This does not work on cookie consent banners.
    """
    webdriver.ActionChains(driver).send_keys(Keys.ESCAPE).perform()
    return "Pressed Escape to close any open pop-up."
def save_screenshot(step_log: ActionStep, agent: CodeAgent) -> None:
    """Step callback: attach a browser screenshot and the current URL to the step's observations."""
    sleep(1.0)  # Let JavaScript animations happen before taking the screenshot
    driver = helium.get_driver()
    current_step = step_log.step_number
    if driver is not None:
        # Remove screenshots from older steps so the model context stays lean
        for previous_step in agent.logs:
            if isinstance(previous_step, ActionStep) and previous_step.step_number <= current_step - 2:
                previous_step.observations_images = None
        png_bytes = driver.get_screenshot_as_png()
        image = Image.open(BytesIO(png_bytes))
        print(f"Captured a browser screenshot: {image.size} pixels")
        step_log.observations_images = [image.copy()]  # Create a copy to ensure it persists, important!

    # Update observations with the current URL
    url_info = f"Current url: {driver.current_url}"
    step_log.observations = url_info if step_log.observations is None else step_log.observations + "\n" + url_info
    return
model = OpenAIServerModel(model_id="gpt-4o")

# OpenAIServerModel connects to any service that exposes an OpenAI-compatible
# API; pass api_base and api_key explicitly for non-default endpoints:
# model = OpenAIServerModel(
#     model_id="gpt-4o",
#     api_base="https://api.openai.com/v1",
#     api_key=os.environ["OPENAI_API_KEY"],
# )
agent = CodeAgent(
    tools=[DuckDuckGoSearchTool(), go_back, close_popups, search_item_ctrl_f],
    model=model,
    additional_authorized_imports=["helium"],
    step_callbacks=[save_screenshot],
    max_steps=20,
    verbosity_level=2,
)
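
# A minimal usage sketch, not part of the original file: the request text and
# helper name are illustrative only. The agent can `import helium` itself thanks
# to additional_authorized_imports, and save_screenshot hands it a fresh
# screenshot plus the current URL after every step.
def run_browser_task(request: str) -> str:
    """Run one browsing task with the vision agent and return its final answer."""
    return str(agent.run(request))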
prompt_analysis = """Extract information from an image by analyzing and interpreting its visual elements to provide a detailed description or identify specific data.
# Steps
1. **Analyze the Image**: Identify key elements such as objects, text, colors, and any notable features or contexts.
2. **Interpret Visual Elements**: Determine the significance or purpose of the elements identified. Consider relationships between objects, text recognition if applicable, and any context clues.
3. **Synthesize Information**: Bring together the interpreted elements to form a coherent understanding or summary.
4. **Verify Details**: Ensure accuracy by cross-referencing identifiable text or icons with known data or references, if relevant.
# Output Format
The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each identified element should be clearly described along with its context or significance.
# Example
**Input**: (An image with a storefront displaying 'Bakery' sign and a variety of bread on display.)
**Output**:
Description:
- **Storefront**: A bakery
- **Signage Text**: "Bakery"
- **Products**: Various types of bread
JSON Example:
```json
{
"storeType": "Bakery",
"signText": "Bakery",
"products": ["bread", "baguette", "pastry"]
}
```
# Notes
- Consider optical character recognition (OCR) for text extraction.
- Evaluate colors and objects for brand or function associations.
- Provide a holistic overview rather than disjointed elements when possible."""
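
# A sketch of applying prompt_analysis to a local image; the file path and the
# helper name are hypothetical. smolagents' agent.run() accepts PIL images via
# its `images` argument, the same channel save_screenshot uses for screenshots.
def analyze_image(path: str) -> str:
    """Describe one image using the structured analysis prompt above."""
    image = Image.open(path)
    return str(agent.run(prompt_analysis, images=[image]))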
prompt_deep_analysis = """Extract information from a video by analyzing and interpreting its audiovisual elements to provide a detailed description or identify specific data.
You will be given specific information to retrieve from the video. Adapt the analysis steps to account for motion, audio, and the scene changes unique to video content.
# Steps
1. **Parse the Video**: Break down the video into manageable segments, focusing on scenes or timeframes relevant to the target information.
2. **Identify Key Elements**: Within these segments, identify crucial visual and audio elements such as objects, text, dialogue, sounds, and any notable features or contexts.
3. **Interpret Audiovisual Elements**: Determine the significance or purpose of the identified elements. Consider relationships between objects, text recognition, audio cues, and any context provided by the video.
4. **Synthesize Information**: Integrate the interpreted elements to form a coherent understanding or summary.
5. **Verify Details**: Ensure accuracy by cross-referencing identifiable text, icons, or audio snippets with known data or references, if relevant.
# Output Format
The output should be a detailed text description or a structured data response (such as a JSON) containing the identified elements and their interpretations. Each element should be described along with its context or significance within the video.
# Examples
**Input**: (A video of a cooking show with captions and background music.)
**Output**:
Description:
- **Scene**: Cooking demonstration of a pasta dish
- **Captions**: Step-by-step instructions
- **Audio**: Background music, presenter dialogue
- **Visual Elements**: Ingredients and cooking utensils
JSON Example:
```json
{
"sceneType": "Cooking Demonstration",
"captions": ["Boil water", "Add pasta"],
"audio": {
"backgroundMusic": "light jazz",
"dialogue": ["Today we are making pasta..."]
},
"visualElements": ["pasta", "saucepan", "spoon"]
}
```
# Notes
- Consider using video timestamps and scene identification for accurate element referencing.
- Evaluate both visual and audio elements for context comprehension.
- Ensure that video dynamics like scene changes or motion are accounted for in the synthesis of information."""
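
# A sketch of applying prompt_deep_analysis to a video; the helper is
# hypothetical. Frame extraction is out of scope here (ffmpeg or OpenCV could
# do it), so this assumes frames were already sampled into PIL images.
def analyze_video_frames(frames: list[Image.Image], question: str) -> str:
    """Answer a question about a video from a list of pre-sampled frames."""
    task = prompt_deep_analysis + "\n\nInformation to retrieve: " + question
    return str(agent.run(task, images=frames))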