Spaces:
Sleeping
Sleeping
Upload 245 files
Browse files
This view is limited to 50 files because it contains too many changes.
See raw diff
- BrowserGym/.gitignore +154 -0
- BrowserGym/.pre-commit-config.yaml +44 -0
- BrowserGym/.readthedocs.yaml +32 -0
- BrowserGym/LICENSE +13 -0
- BrowserGym/Makefile +17 -0
- BrowserGym/README.md +254 -0
- BrowserGym/browsergym/assistantbench/README.md +21 -0
- BrowserGym/browsergym/assistantbench/pyproject.toml +35 -0
- BrowserGym/browsergym/assistantbench/requirements.txt +4 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/__init__.py +54 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py +68 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_factory.py +28 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py +34 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py +174 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py +25 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py +132 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py +142 -0
- BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/utils.py +73 -0
- BrowserGym/browsergym/browsergym.egg-info/PKG-INFO +22 -0
- BrowserGym/browsergym/browsergym.egg-info/SOURCES.txt +6 -0
- BrowserGym/browsergym/browsergym.egg-info/dependency_links.txt +1 -0
- BrowserGym/browsergym/browsergym.egg-info/requires.txt +8 -0
- BrowserGym/browsergym/browsergym.egg-info/top_level.txt +1 -0
- BrowserGym/browsergym/core/README.md +10 -0
- BrowserGym/browsergym/core/pyproject.toml +42 -0
- BrowserGym/browsergym/core/requirements.txt +8 -0
- BrowserGym/browsergym/core/src/browsergym/core/__init__.py +27 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/__init__.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/chat.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/constants.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/env.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/observation.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/registration.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/spaces.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/__pycache__/task.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__init__.py +11 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/__init__.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/base.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/functions.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/highlevel.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/parsers.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/utils.cpython-311.pyc +0 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/base.py +63 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/functions.py +624 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/highlevel.py +522 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/parsers.py +92 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/python.py +112 -0
- BrowserGym/browsergym/core/src/browsergym/core/action/utils.py +288 -0
- BrowserGym/browsergym/core/src/browsergym/core/chat.py +95 -0
- BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox.html +243 -0
BrowserGym/.gitignore
ADDED
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
.DS_Store
|
2 |
+
.idea/
|
3 |
+
docs/src/generated/
|
4 |
+
|
5 |
+
# Byte-compiled / optimized / DLL files
|
6 |
+
__pycache__/
|
7 |
+
*.py[cod]
|
8 |
+
*$py.class
|
9 |
+
|
10 |
+
# C extensions
|
11 |
+
*.so
|
12 |
+
|
13 |
+
# Distribution / packaging
|
14 |
+
.Python
|
15 |
+
build/
|
16 |
+
develop-eggs/
|
17 |
+
dist/
|
18 |
+
downloads/
|
19 |
+
eggs/
|
20 |
+
.eggs/
|
21 |
+
lib/
|
22 |
+
lib64/
|
23 |
+
parts/
|
24 |
+
sdist/
|
25 |
+
var/
|
26 |
+
wheels/
|
27 |
+
pip-wheel-metadata/
|
28 |
+
share/python-wheels/
|
29 |
+
*.egg-info/
|
30 |
+
.installed.cfg
|
31 |
+
*.egg
|
32 |
+
MANIFEST
|
33 |
+
|
34 |
+
# PyInstaller
|
35 |
+
# Usually these files are written by a python script from a template
|
36 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
37 |
+
*.manifest
|
38 |
+
*.spec
|
39 |
+
|
40 |
+
# Installer logs
|
41 |
+
pip-log.txt
|
42 |
+
pip-delete-this-directory.txt
|
43 |
+
|
44 |
+
# Unit test / coverage reports
|
45 |
+
htmlcov/
|
46 |
+
.tox/
|
47 |
+
.nox/
|
48 |
+
.coverage
|
49 |
+
.coverage.*
|
50 |
+
.cache
|
51 |
+
nosetests.xml
|
52 |
+
coverage.xml
|
53 |
+
*.cover
|
54 |
+
*.py,cover
|
55 |
+
.hypothesis/
|
56 |
+
.pytest_cache/
|
57 |
+
|
58 |
+
# Translations
|
59 |
+
*.mo
|
60 |
+
*.pot
|
61 |
+
|
62 |
+
# Django stuff:
|
63 |
+
*.log
|
64 |
+
local_settings.py
|
65 |
+
db.sqlite3
|
66 |
+
db.sqlite3-journal
|
67 |
+
|
68 |
+
# Flask stuff:
|
69 |
+
instance/
|
70 |
+
.webassets-cache
|
71 |
+
|
72 |
+
# Scrapy stuff:
|
73 |
+
.scrapy
|
74 |
+
|
75 |
+
# Sphinx documentation
|
76 |
+
docs/_build/
|
77 |
+
|
78 |
+
# PyBuilder
|
79 |
+
target/
|
80 |
+
|
81 |
+
# Jupyter Notebook
|
82 |
+
.ipynb_checkpoints
|
83 |
+
|
84 |
+
# IPython
|
85 |
+
profile_default/
|
86 |
+
ipython_config.py
|
87 |
+
|
88 |
+
# pyenv
|
89 |
+
.python-version
|
90 |
+
|
91 |
+
# pipenv
|
92 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
93 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
94 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
95 |
+
# install all needed dependencies.
|
96 |
+
#Pipfile.lock
|
97 |
+
|
98 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow
|
99 |
+
__pypackages__/
|
100 |
+
|
101 |
+
# Celery stuff
|
102 |
+
celerybeat-schedule
|
103 |
+
celerybeat.pid
|
104 |
+
|
105 |
+
# SageMath parsed files
|
106 |
+
*.sage.py
|
107 |
+
|
108 |
+
# Environments
|
109 |
+
.env
|
110 |
+
.venv
|
111 |
+
env/
|
112 |
+
venv/
|
113 |
+
ENV/
|
114 |
+
env.bak/
|
115 |
+
venv.bak/
|
116 |
+
|
117 |
+
# Spyder project settings
|
118 |
+
.spyderproject
|
119 |
+
.spyproject
|
120 |
+
|
121 |
+
# Rope project settings
|
122 |
+
.ropeproject
|
123 |
+
|
124 |
+
# mkdocs documentation
|
125 |
+
/site
|
126 |
+
|
127 |
+
# mypy
|
128 |
+
.mypy_cache/
|
129 |
+
.dmypy.json
|
130 |
+
dmypy.json
|
131 |
+
|
132 |
+
# Pyre type checker
|
133 |
+
.pyre/
|
134 |
+
|
135 |
+
# error logs
|
136 |
+
error_logs.txt
|
137 |
+
|
138 |
+
# tests
|
139 |
+
tests/results
|
140 |
+
tmp.py
|
141 |
+
.vscode/**
|
142 |
+
|
143 |
+
# demo and results
|
144 |
+
results/
|
145 |
+
|
146 |
+
.vscode/launch.json
|
147 |
+
|
148 |
+
# assistantbench
|
149 |
+
tests/assistantbench/assistantbench-predictions-test.jsonl
|
150 |
+
|
151 |
+
# weblinx
|
152 |
+
bg_wl_data/
|
153 |
+
|
154 |
+
uv.lock
|
BrowserGym/.pre-commit-config.yaml
ADDED
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
fail_fast: false
|
2 |
+
|
3 |
+
default_language_version:
|
4 |
+
python: python3
|
5 |
+
|
6 |
+
repos:
|
7 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
8 |
+
rev: v4.2.0
|
9 |
+
hooks:
|
10 |
+
- id: trailing-whitespace
|
11 |
+
exclude: ^(.*)\.md$
|
12 |
+
- id: end-of-file-fixer
|
13 |
+
- id: check-yaml
|
14 |
+
exclude: ^(.circleci/recipe|recipe) # conda build recipes are templated
|
15 |
+
- id: check-added-large-files
|
16 |
+
- repo: https://github.com/pocc/pre-commit-hooks
|
17 |
+
rev: v1.1.1
|
18 |
+
hooks:
|
19 |
+
- id: clang-format
|
20 |
+
args: [--style=file, -i]
|
21 |
+
- id: clang-tidy
|
22 |
+
args: [--fix, --fix-errors]
|
23 |
+
- repo: https://github.com/psf/black
|
24 |
+
rev: 24.2.0
|
25 |
+
hooks:
|
26 |
+
- id: black
|
27 |
+
args: [--config=./pyproject.toml]
|
28 |
+
- repo: https://github.com/asottile/blacken-docs
|
29 |
+
rev: v1.12.1
|
30 |
+
hooks:
|
31 |
+
- id: blacken-docs
|
32 |
+
args: [ '--line-length', '100' ]
|
33 |
+
additional_dependencies: [black]
|
34 |
+
- repo: https://github.com/Lucas-C/pre-commit-hooks
|
35 |
+
rev: v1.5.5
|
36 |
+
hooks:
|
37 |
+
- id: forbid-crlf
|
38 |
+
- id: remove-crlf
|
39 |
+
# Black does not clear tabs in docstrings
|
40 |
+
- id: forbid-tabs
|
41 |
+
files: '.*\.py$'
|
42 |
+
- id: remove-tabs
|
43 |
+
files: '.*\.py$'
|
44 |
+
args: [ '--whitespaces-count', '4' ]
|
BrowserGym/.readthedocs.yaml
ADDED
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# .readthedocs.yaml
|
2 |
+
# Read the Docs configuration file
|
3 |
+
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
|
4 |
+
|
5 |
+
# Required
|
6 |
+
version: 2
|
7 |
+
|
8 |
+
# Set the OS, Python version and other tools you might need
|
9 |
+
build:
|
10 |
+
os: ubuntu-22.04
|
11 |
+
tools:
|
12 |
+
python: "3.12"
|
13 |
+
# You can also specify other tool versions:
|
14 |
+
# nodejs: "19"
|
15 |
+
# rust: "1.64"
|
16 |
+
# golang: "1.19"
|
17 |
+
|
18 |
+
# Build documentation in the "docs/" directory with Sphinx
|
19 |
+
sphinx:
|
20 |
+
configuration: docs/src/conf.py
|
21 |
+
|
22 |
+
# Optionally build your docs in additional formats such as PDF and ePub
|
23 |
+
# formats:
|
24 |
+
# - pdf
|
25 |
+
# - epub
|
26 |
+
|
27 |
+
# Optional but recommended, declare the Python requirements required
|
28 |
+
# to build your documentation
|
29 |
+
# See https://docs.readthedocs.io/en/stable/guides/reproducible-builds.html
|
30 |
+
python:
|
31 |
+
install:
|
32 |
+
- requirements: docs/requirements.txt
|
BrowserGym/LICENSE
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Copyright 2024 ServiceNow
|
2 |
+
|
3 |
+
Licensed under the Apache License, Version 2.0 (the "License");
|
4 |
+
you may not use this file except in compliance with the License.
|
5 |
+
You may obtain a copy of the License at
|
6 |
+
|
7 |
+
http://www.apache.org/licenses/LICENSE-2.0
|
8 |
+
|
9 |
+
Unless required by applicable law or agreed to in writing, software
|
10 |
+
distributed under the License is distributed on an "AS IS" BASIS,
|
11 |
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
12 |
+
See the License for the specific language governing permissions and
|
13 |
+
limitations under the License.
|
BrowserGym/Makefile
ADDED
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
install:
|
2 |
+
@echo "--- 🚀 Installing project dependencies ---"
|
3 |
+
pip install -e ./browsergym/core -e ./browsergym/miniwob -e ./browsergym/webarena -e ./browsergym/visualwebarena/ -e ./browsergym/experiments -e ./browsergym/assistantbench -e ./browsergym/
|
4 |
+
playwright install chromium
|
5 |
+
|
6 |
+
install-demo:
|
7 |
+
@echo "--- 🚀 Installing demo dependencies ---"
|
8 |
+
pip install -r demo_agent/requirements.txt
|
9 |
+
playwright install chromium
|
10 |
+
|
11 |
+
demo:
|
12 |
+
@echo "--- 🚀 Running demo agent ---"
|
13 |
+
(set -x && cd demo_agent && python run_demo.py)
|
14 |
+
|
15 |
+
test-core:
|
16 |
+
@echo "--- 🧪 Running tests ---"
|
17 |
+
pytest -n auto ./tests/core
|
BrowserGym/README.md
ADDED
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<div align="center">
|
2 |
+
|
3 |
+

|
4 |
+
|
5 |
+
🛠️ [Setup](#%EF%B8%8F-setup) -
|
6 |
+
🏋 [Usage](#-usage) -
|
7 |
+
💻 [Demo](#-demo) -
|
8 |
+
🌐 [Ecosystem](#-ecosystem) -
|
9 |
+
🚀 [AgentLab](https://github.com/ServiceNow/AgentLab) -
|
10 |
+
🌟 [Contributors](#-contributors) -
|
11 |
+
📄 [Paper](https://arxiv.org/abs/2412.05467) -
|
12 |
+
📝 [Citation](#-citing-this-work)
|
13 |
+
|
14 |
+
[](https://pypi.org/project/browsergym/)
|
15 |
+
[](http://www.apache.org/licenses/LICENSE-2.0)
|
16 |
+
[](https://pypistats.org/packages/browsergym-core)
|
17 |
+
[](https://star-history.com/#ServiceNow/BrowserGym)
|
18 |
+
[](https://github.com/ServiceNow/BrowserGym/actions/workflows/code_format.yml)
|
19 |
+
[](https://github.com/ServiceNow/BrowserGym/actions/workflows/unit_tests.yml)
|
20 |
+
|
21 |
+
```sh
|
22 |
+
pip install browsergym
|
23 |
+
```
|
24 |
+
|
25 |
+
</div>
|
26 |
+
|
27 |
+
> [!WARNING]
|
28 |
+
> BrowserGym is meant to provide an open, easy-to-use and extensible framework to accelerate the field of web agent research.
|
29 |
+
> It is not meant to be a consumer product. Use with caution!
|
30 |
+
|
31 |
+
> [!TIP]
|
32 |
+
> 🚀 Check out [AgentLab](https://github.com/ServiceNow/AgentLab)✨ !
|
33 |
+
> A seamless framework to implement, test, and evaluate your web agents on all BrowserGym benchmarks.
|
34 |
+
|
35 |
+
https://github.com/ServiceNow/BrowserGym/assets/26232819/e0bfc788-cc8e-44f1-b8c3-0d1114108b85
|
36 |
+
|
37 |
+
_Example of a GPT-4V agent executing open-ended tasks (top row, chat interactive), as well as WebArena and WorkArena tasks (bottom row)._
|
38 |
+
|
39 |
+
BrowserGym includes the following benchmarks by default:
|
40 |
+
- [MiniWoB](https://miniwob.farama.org/)
|
41 |
+
- [WebArena](https://webarena.dev/)
|
42 |
+
- [VisualWebArena](https://jykoh.com/vwa)
|
43 |
+
- [WorkArena](https://github.com/ServiceNow/WorkArena)
|
44 |
+
- [AssistantBench](https://github.com/oriyor/assistantbench)
|
45 |
+
- [WebLINX](https://github.com/McGill-NLP/weblinx) (static benchmark)
|
46 |
+
|
47 |
+
Designing new web benchmarks with BrowserGym is easy and simply requires inheriting from the [`AbstractBrowserTask`](https://github.com/ServiceNow/BrowserGym/blob/main/browsergym/core/src/browsergym/core/task.py#L7C7-L7C26) class.
|
48 |
+
|
49 |
+
## 🛠️ Setup
|
50 |
+
|
51 |
+
To use browsergym, install one of the following packages:
|
52 |
+
```sh
|
53 |
+
pip install browsergym # (recommended) everything below
|
54 |
+
pip install browsergym-experiments # experiment utilities (agent, loop, benchmarks) + everything below
|
55 |
+
pip install browsergym-core # core functionalities only (no benchmark, just the openended task)
|
56 |
+
pip install browsergym-miniwob # core + miniwob
|
57 |
+
pip install browsergym-webarena # core + webarena
|
58 |
+
pip install browsergym-visualwebarena # core + visualwebarena
|
59 |
+
pip install browsergym-workarena # core + workarena
|
60 |
+
pip install browsergym-assistantbench # core + assistantbench
|
61 |
+
pip install weblinx-browsergym # core + weblinx
|
62 |
+
```
|
63 |
+
|
64 |
+
Then set up Playwright by running:
|
65 |
+
```sh
|
66 |
+
playwright install chromium
|
67 |
+
```
|
68 |
+
|
69 |
+
Finally, each benchmark comes with its own specific setup that requires following additional steps.
|
70 |
+
- for MiniWoB++, see [miniwob/README.md](browsergym/miniwob/README.md)
|
71 |
+
- for WebArena, see [webarena/README.md](browsergym/webarena/README.md)
|
72 |
+
- for VisualWebArena, see [visualwebarena/README.md](browsergym/visualwebarena/README.md)
|
73 |
+
- for WorkArena, see [WorkArena](https://github.com/ServiceNow/WorkArena)
|
74 |
+
- for AssistantBench, see [assistantbench/README.md](browsergym/assistantbench/README.md)
|
75 |
+
|
76 |
+
### 🏗️ Development setup
|
77 |
+
|
78 |
+
To install browsergym locally for development, use the following commands:
|
79 |
+
```sh
|
80 |
+
git clone git@github.com:ServiceNow/BrowserGym.git
|
81 |
+
cd BrowserGym
|
82 |
+
make install
|
83 |
+
```
|
84 |
+
|
85 |
+
Contributions are welcome! 😊
|
86 |
+
|
87 |
+
## 🏋 Usage
|
88 |
+
|
89 |
+
Boilerplate code to run an agent on an interactive, open-ended task:
|
90 |
+
```python
|
91 |
+
import gymnasium as gym
|
92 |
+
import browsergym.core # register the openended task as a gym environment
|
93 |
+
|
94 |
+
# start an openended environment
|
95 |
+
env = gym.make(
|
96 |
+
"browsergym/openended",
|
97 |
+
task_kwargs={"start_url": "https://www.google.com/"}, # starting URL
|
98 |
+
wait_for_user_message=True, # wait for a user message after each agent message sent to the chat
|
99 |
+
)
|
100 |
+
# run the environment <> agent loop until termination
|
101 |
+
obs, info = env.reset()
|
102 |
+
while True:
|
103 |
+
action = ... # implement your agent here
|
104 |
+
obs, reward, terminated, truncated, info = env.step(action)
|
105 |
+
if terminated or truncated:
|
106 |
+
break
|
107 |
+
# release the environment
|
108 |
+
env.close()
|
109 |
+
```
|
110 |
+
|
111 |
+
MiniWoB
|
112 |
+
```python
|
113 |
+
import gymnasium as gym
|
114 |
+
import browsergym.miniwob # register miniwob tasks as gym environments
|
115 |
+
|
116 |
+
# start a miniwob task
|
117 |
+
env = gym.make("browsergym/miniwob.choose-list")
|
118 |
+
...
|
119 |
+
|
120 |
+
# list all the available miniwob tasks
|
121 |
+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/miniwob")]
|
122 |
+
print("\n".join(env_ids))
|
123 |
+
```
|
124 |
+
|
125 |
+
WorkArena
|
126 |
+
```python
|
127 |
+
import gymnasium as gym
|
128 |
+
import browsergym.workarena # register workarena tasks as gym environments
|
129 |
+
|
130 |
+
# start a workarena task
|
131 |
+
env = gym.make("browsergym/workarena.servicenow.order-ipad-pro")
|
132 |
+
...
|
133 |
+
|
134 |
+
# list all the available workarena tasks
|
135 |
+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/workarena")]
|
136 |
+
print("\n".join(env_ids))
|
137 |
+
```
|
138 |
+
|
139 |
+
WebArena
|
140 |
+
```python
|
141 |
+
import gymnasium as gym
|
142 |
+
import browsergym.webarena # register webarena tasks as gym environments
|
143 |
+
|
144 |
+
# start a webarena task
|
145 |
+
env = gym.make("browsergym/webarena.310")
|
146 |
+
...
|
147 |
+
|
148 |
+
# list all the available webarena tasks
|
149 |
+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/webarena")]
|
150 |
+
print("\n".join(env_ids))
|
151 |
+
```
|
152 |
+
|
153 |
+
VisualWebArena
|
154 |
+
```python
|
155 |
+
import gymnasium as gym
|
156 |
+
import browsergym.visualwebarena # register visualwebarena tasks as gym environments
|
157 |
+
|
158 |
+
# start a visualwebarena task
|
159 |
+
env = gym.make("browsergym/visualwebarena.721")
|
160 |
+
...
|
161 |
+
|
162 |
+
# list all the available visualwebarena tasks
|
163 |
+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/visualwebarena")]
|
164 |
+
print("\n".join(env_ids))
|
165 |
+
```
|
166 |
+
|
167 |
+
AssistantBench
|
168 |
+
```python
|
169 |
+
import gymnasium as gym
|
170 |
+
import browsergym.assistantbench # register assistantbench tasks as gym environments
|
171 |
+
|
172 |
+
# start an assistantbench task
|
173 |
+
env = gym.make("browsergym/assistantbench.validation.3")
|
174 |
+
...
|
175 |
+
|
176 |
+
# list all the available assistantbench tasks
|
177 |
+
env_ids = [id for id in gym.envs.registry.keys() if id.startswith("browsergym/assistantbench")]
|
178 |
+
print("\n".join(env_ids))
|
179 |
+
```
|
180 |
+
|
181 |
+
## 💻 Demo
|
182 |
+
|
183 |
+
If you want to experiment with a demo agent in BrowserGym, follow these steps
|
184 |
+
```sh
|
185 |
+
# conda setup
|
186 |
+
conda env create -f demo_agent/environment.yml
|
187 |
+
conda activate demo_agent
|
188 |
+
|
189 |
+
# or pip setup
|
190 |
+
pip install -r demo_agent/requirements.txt
|
191 |
+
|
192 |
+
# then download the browser for playwright
|
193 |
+
playwright install chromium
|
194 |
+
```
|
195 |
+
|
196 |
+
Our demo agent uses `openai` as a backend, so be sure to set your `OPENAI_API_KEY`.
|
197 |
+
|
198 |
+
Launch the demo agent as follows
|
199 |
+
```sh
|
200 |
+
# openended (interactive chat mode)
|
201 |
+
python demo_agent/run_demo.py --task_name openended --start_url https://www.google.com
|
202 |
+
|
203 |
+
# miniwob
|
204 |
+
python demo_agent/run_demo.py --task_name miniwob.click-test
|
205 |
+
|
206 |
+
# workarena
|
207 |
+
python demo_agent/run_demo.py --task_name workarena.servicenow.order-standard-laptop
|
208 |
+
|
209 |
+
# webarena
|
210 |
+
python demo_agent/run_demo.py --task_name webarena.4
|
211 |
+
|
212 |
+
# visualwebarena
|
213 |
+
python demo_agent/run_demo.py --task_name visualwebarena.398
|
214 |
+
```
|
215 |
+
|
216 |
+
You can customize your experience by changing the `model_name` to your preferred LLM (it uses `gpt-4o-mini` by default), adding screenshots for your VLMs with `use_screenshot`, and much more!
|
217 |
+
|
218 |
+
```sh
|
219 |
+
python demo_agent/run_demo.py --help
|
220 |
+
```
|
221 |
+
|
222 |
+
## 🌐 Ecosystem
|
223 |
+
|
224 |
+
- [AgentLab](https://github.com/ServiceNow/AgentLab): Seamlessly run agents on benchmarks, collect and analyse traces.
|
225 |
+
- [WorkArena(++)](https://github.com/ServiceNow/WorkArena): A benchmark for web agents on the ServiceNow platform.
|
226 |
+
- [WebArena](https://github.com/web-arena-x/webarena): A benchmark of realistic web tasks on self-hosted domains.
|
227 |
+
- [VisualWebArena](https://github.com/web-arena-x/visualwebarena): A benchmark of realistic visual web tasks on self-hosted domains.
|
228 |
+
- [MiniWoB(++)](https://miniwob.farama.org/): A collection of over 100 web tasks on synthetic web pages.
|
229 |
+
- [WebLINX](https://github.com/McGill-NLP/weblinx): A dataset of real-world web interaction traces.
|
230 |
+
- [AssistantBench](https://github.com/oriyor/assistantbench): A benchmark of realistic and time-consuming tasks on the open web.
|
231 |
+
- [DoomArena](https://github.com/ServiceNow/DoomArena): A framework for AI agent security testing which supports injecting attacks into web pages from Browsergym environments.
|
232 |
+
|
233 |
+
## 🌟 Contributors
|
234 |
+
|
235 |
+
[](https://github.com/ServiceNow/BrowserGym/graphs/contributors)
|
236 |
+
|
237 |
+
## 📝 Citing This Work
|
238 |
+
|
239 |
+
Please use the following BibTeX to cite our work:
|
240 |
+
```tex
|
241 |
+
@inproceedings{workarena2024,
|
242 |
+
title = {{W}ork{A}rena: How Capable are Web Agents at Solving Common Knowledge Work Tasks?},
|
243 |
+
author = {Drouin, Alexandre and Gasse, Maxime and Caccia, Massimo and Laradji, Issam H. and Del Verme, Manuel and Marty, Tom and Vazquez, David and Chapados, Nicolas and Lacoste, Alexandre},
|
244 |
+
booktitle = {Proceedings of the 41st International Conference on Machine Learning},
|
245 |
+
pages = {11642--11662},
|
246 |
+
year = {2024},
|
247 |
+
editor = {Salakhutdinov, Ruslan and Kolter, Zico and Heller, Katherine and Weller, Adrian and Oliver, Nuria and Scarlett, Jonathan and Berkenkamp, Felix},
|
248 |
+
volume = {235},
|
249 |
+
series = {Proceedings of Machine Learning Research},
|
250 |
+
month = {21--27 Jul},
|
251 |
+
publisher = {PMLR},
|
252 |
+
url = {https://proceedings.mlr.press/v235/drouin24a.html},
|
253 |
+
}
|
254 |
+
```
|
BrowserGym/browsergym/assistantbench/README.md
ADDED
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# AssistantBench <> BrowserGym
|
2 |
+
|
3 |
+
This package provides an implementation for using the [AssistantBench](https://assistantbench.github.io/) benchmark in BrowserGym.
|
4 |
+
|
5 |
+
Because AssistantBench includes open-ended tasks, setup is extremely easy and simply requires installing the package.
|
6 |
+
|
7 |
+
Please note that AssistantBench has a hidden test set, so test set predictions will need to be uploaded to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
|
8 |
+
|
9 |
+
## Setting up
|
10 |
+
|
11 |
+
- Install the package (this is still a wip)
|
12 |
+
```
|
13 |
+
pip install browsergym-assistantbench
|
14 |
+
```
|
15 |
+
|
16 |
+
- Run inference, e.g., run the following commands for demo on a simple toy task
|
17 |
+
```
|
18 |
+
python demo_agent/run_demo.py --task_name assistantbench.validation.3
|
19 |
+
```
|
20 |
+
|
21 |
+
- Test set predictions will be saved to `./assistantbench-predictions-test.jsonl`. To evaluate on the official test set, upload these predictions to the official [leaderboard](https://huggingface.co/spaces/AssistantBench/leaderboard).
|
BrowserGym/browsergym/assistantbench/pyproject.toml
ADDED
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["hatchling", "hatch-requirements-txt"]
|
3 |
+
build-backend = "hatchling.build"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "browsergym-assistantbench"
|
7 |
+
description = "AssistantBench benchmark for BrowserGym"
|
8 |
+
authors = [
|
9 |
+
{name = "Ori Yoran"},
|
10 |
+
{name = "Maxime Gasse"},
|
11 |
+
]
|
12 |
+
readme = "README.md"
|
13 |
+
requires-python = ">3.7"
|
14 |
+
license = {text = "Apache-2.0"}
|
15 |
+
classifiers = [
|
16 |
+
"Development Status :: 3 - Alpha",
|
17 |
+
"Programming Language :: Python :: 3",
|
18 |
+
"Operating System :: OS Independent",
|
19 |
+
"Intended Audience :: Science/Research",
|
20 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
21 |
+
"License :: OSI Approved :: Apache Software License",
|
22 |
+
]
|
23 |
+
dynamic = ["dependencies", "version"]
|
24 |
+
|
25 |
+
[project.urls]
|
26 |
+
homepage = "https://github.com/ServiceNow/BrowserGym"
|
27 |
+
|
28 |
+
[tool.hatch.version]
|
29 |
+
path = "../core/src/browsergym/core/__init__.py"
|
30 |
+
|
31 |
+
[tool.hatch.metadata.hooks.requirements_txt]
|
32 |
+
files = ["requirements.txt"]
|
33 |
+
|
34 |
+
[tool.hatch.build.targets.wheel]
|
35 |
+
packages = ["src/browsergym"]
|
BrowserGym/browsergym/assistantbench/requirements.txt
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
browsergym-core==0.13.4
|
2 |
+
datasets
|
3 |
+
scipy
|
4 |
+
numpy
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/__init__.py
ADDED
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from browsergym.core.registration import register_task
|
2 |
+
|
3 |
+
from . import task
|
4 |
+
|
5 |
+
TOY_AB_TASK_IDS = []
|
6 |
+
VALID_AB_TASK_IDS = []
|
7 |
+
TEST_AB_TASK_IDS = []
|
8 |
+
|
9 |
+
|
10 |
+
# register a toy easy task for testing implementation
|
11 |
+
gym_id = f"assistantbench.imp.0"
|
12 |
+
register_task(
|
13 |
+
gym_id,
|
14 |
+
task.AssistantBenchTask,
|
15 |
+
task_kwargs={
|
16 |
+
"task_id": f"imp.0",
|
17 |
+
},
|
18 |
+
default_task_kwargs={
|
19 |
+
"save_predictions": False, # can be overriden
|
20 |
+
},
|
21 |
+
)
|
22 |
+
TOY_AB_TASK_IDS.append(gym_id)
|
23 |
+
|
24 |
+
# register the AssistantBench dev set
|
25 |
+
for task_id in range(33):
|
26 |
+
gym_id = f"assistantbench.validation.{task_id}"
|
27 |
+
register_task(
|
28 |
+
gym_id,
|
29 |
+
task.AssistantBenchTask,
|
30 |
+
task_kwargs={
|
31 |
+
"task_id": f"validation.{task_id}",
|
32 |
+
},
|
33 |
+
default_task_kwargs={
|
34 |
+
"save_predictions": False, # can be overriden
|
35 |
+
},
|
36 |
+
)
|
37 |
+
VALID_AB_TASK_IDS.append(gym_id)
|
38 |
+
|
39 |
+
# register the AssistantBench test set
|
40 |
+
for task_id in range(181):
|
41 |
+
gym_id = f"assistantbench.test.{task_id}"
|
42 |
+
register_task(
|
43 |
+
gym_id,
|
44 |
+
task.AssistantBenchTask,
|
45 |
+
task_kwargs={
|
46 |
+
"task_id": f"test.{task_id}",
|
47 |
+
},
|
48 |
+
default_task_kwargs={
|
49 |
+
"save_predictions": True, # can be overriden
|
50 |
+
},
|
51 |
+
)
|
52 |
+
TEST_AB_TASK_IDS.append(gym_id)
|
53 |
+
|
54 |
+
ALL_AB_TASK_IDS = TOY_AB_TASK_IDS + VALID_AB_TASK_IDS + TEST_AB_TASK_IDS
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_dicts.py
ADDED
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Dict, List
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
from .utils import _align_bags
|
6 |
+
|
7 |
+
|
8 |
+
def calculate_f1_score(precision, recall):
|
9 |
+
if precision + recall == 0:
|
10 |
+
return 0 # Handle the case to avoid division by zero
|
11 |
+
return 2 * (precision * recall) / (precision + recall)
|
12 |
+
|
13 |
+
|
14 |
+
def calc_recall(pred: Dict, gold: Dict, use_gold_for_eval: bool):
|
15 |
+
from .evaluate_factory import get_evaluator_from_gold_answer
|
16 |
+
|
17 |
+
recall = []
|
18 |
+
for gold_key, gold_value in gold.items():
|
19 |
+
pred_value = pred.get(gold_key)
|
20 |
+
gold_value = fix_number(gold_value)
|
21 |
+
pred_value = fix_number(pred_value)
|
22 |
+
if gold_key not in pred:
|
23 |
+
recall.append(0)
|
24 |
+
else:
|
25 |
+
evaluator = (
|
26 |
+
get_evaluator_from_gold_answer(type(gold_value))
|
27 |
+
if use_gold_for_eval
|
28 |
+
else get_evaluator_from_gold_answer(type(pred_value))
|
29 |
+
)
|
30 |
+
if type(pred_value) != type(gold_value):
|
31 |
+
recall.append(0)
|
32 |
+
continue
|
33 |
+
recall.append(evaluator(pred_value, gold_value))
|
34 |
+
avg_recall = np.average(recall)
|
35 |
+
return avg_recall
|
36 |
+
|
37 |
+
|
38 |
+
def fix_number(number):
|
39 |
+
|
40 |
+
if type(number) == str:
|
41 |
+
copy_ans = number
|
42 |
+
copy_ans = " ".join(
|
43 |
+
" ".join(" ".join(copy_ans.split("$")).split("%")).split("sqft")
|
44 |
+
).strip()
|
45 |
+
copy_ans = copy_ans.strip()
|
46 |
+
copy_ans = copy_ans.replace(",", ".")
|
47 |
+
try:
|
48 |
+
return float(copy_ans)
|
49 |
+
except:
|
50 |
+
return number
|
51 |
+
elif type(number) == int:
|
52 |
+
return float(number)
|
53 |
+
else:
|
54 |
+
return number
|
55 |
+
|
56 |
+
|
57 |
+
def evaluate_pair_of_dicts(pred: Dict, gold: Dict):
    """Score one predicted dictionary against one gold dictionary with F1.

    Recall is measured over the gold keys, precision over the predicted
    keys (by swapping the two arguments of ``calc_recall``).
    """
    recall_score = calc_recall(pred, gold, True)
    precision_score = calc_recall(gold, pred, False)
    return calculate_f1_score(precision_score, recall_score)
|
62 |
+
|
63 |
+
|
64 |
+
def evaluate_dicts(pred: List[Dict], gold: List[Dict]):
    """Score a list of predicted dictionaries against a list of gold ones.

    A single dictionary is accepted and treated as a one-element list.
    Anything that is not a non-empty list starting with a dictionary scores
    0, including values without a length such as floats or ``None``.

    Returns:
        The mean of the aligned per-item scores, or 0 for unusable input.
    """
    if type(pred) == dict:
        # Wrap a bare dict so the alignment iterates over dicts, not its keys.
        pred = [pred]
    if type(pred) != list or len(pred) == 0 or type(pred[0]) != dict:
        return 0
    max_alignment_scores = _align_bags(pred, gold, evaluate_pair_of_dicts)
    return np.average(max_alignment_scores)
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_factory.py
ADDED
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
from .evaluate_dicts import evaluate_dicts
|
4 |
+
from .evaluate_numbers import evaluate_numbers
|
5 |
+
from .evaluate_strings import evaluate_strings
|
6 |
+
|
7 |
+
EvaluatorFactory = {
|
8 |
+
"string": evaluate_strings,
|
9 |
+
"number": evaluate_numbers,
|
10 |
+
"json": evaluate_dicts,
|
11 |
+
"string list": evaluate_strings,
|
12 |
+
}
|
13 |
+
|
14 |
+
EvaluatorFactoryFromType = {
|
15 |
+
str: evaluate_strings,
|
16 |
+
int: evaluate_numbers,
|
17 |
+
float: evaluate_numbers,
|
18 |
+
bool: evaluate_strings,
|
19 |
+
list: evaluate_strings,
|
20 |
+
}
|
21 |
+
|
22 |
+
|
23 |
+
def get_evaluator(evaluator: str):
    """Look up the scoring function registered under the name ``evaluator``.

    Raises:
        KeyError: If ``evaluator`` is not a key of ``EvaluatorFactory``.
    """
    scoring_function = EvaluatorFactory[evaluator]
    return scoring_function
|
25 |
+
|
26 |
+
|
27 |
+
def get_evaluator_from_gold_answer(gold_answer: type):
    """Look up the scoring function registered for a Python type.

    Args:
        gold_answer: The *type* of the answer (for example ``str`` or
            ``float``), not the answer value itself; the lookup table is
            keyed by type objects.

    Raises:
        KeyError: If the type has no entry in ``EvaluatorFactoryFromType``.
    """
    return EvaluatorFactoryFromType[gold_answer]
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_numbers.py
ADDED
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Union
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
|
5 |
+
|
6 |
+
# Renamed calc_z function to distance_function_log
|
7 |
+
def distance_function_log(pred: float, gold: float):
    """Score how close ``pred`` is to ``gold`` on a logarithmic scale.

    The score is ``1 - |ln(pred / gold)|`` clamped below at 0, so it always
    lies in ``[0, 1]`` and is symmetric in over- and under-estimation.

    Returns:
        1 when both values are zero, 0 when the values have opposite signs
        (the log ratio is undefined), otherwise the clamped score.
    """
    if pred == gold == 0:
        return 1
    # Substitute a small epsilon for an exact zero so the ratio stays defined.
    if pred == 0:
        pred = 1e-4
    if gold == 0:
        gold = 1e-4

    ratio = pred / gold
    if ratio <= 0:
        return 0
    # abs() keeps the score <= 1 for negative pairs too; comparing the raw
    # values to choose the ratio direction inflated it above 1 in that case.
    return max(0, 1 - abs(np.log(ratio)))
|
18 |
+
|
19 |
+
|
20 |
+
def evaluate_numbers(pred: Union[float, str], gold: float):
    """Score a numeric prediction against a numeric gold answer.

    Either argument whose type is not exactly ``float`` or ``int`` is passed
    through ``float()`` first. If that conversion fails the score is 0; this
    covers unparsable strings (ValueError) as well as values such as
    ``None`` or lists (TypeError).

    Returns:
        0 when a value cannot be converted, otherwise the result of
        ``distance_function_log``.
    """
    converted = []
    for value in (pred, gold):
        if type(value) != float and type(value) != int:
            try:
                value = float(value)
            except (TypeError, ValueError):
                return 0
        converted.append(value)
    return distance_function_log(converted[0], converted[1])
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/evaluate_strings.py
ADDED
@@ -0,0 +1,174 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
"""
|
2 |
+
Evaluation for two strings or list of strings.
|
3 |
+
Code taken from the DROP benchmark - https://github.com/allenai/allennlp-reading-comprehension/blob/master/allennlp_rc/eval/drop_eval.py
|
4 |
+
"""
|
5 |
+
|
6 |
+
import re
|
7 |
+
import string
|
8 |
+
from typing import List, Set, Tuple, Union
|
9 |
+
|
10 |
+
import numpy as np
|
11 |
+
from scipy.optimize import linear_sum_assignment
|
12 |
+
|
13 |
+
|
14 |
+
# From here through _normalize_answer was originally copied from:
|
15 |
+
# https://worksheets.codalab.org/rest/bundles/0x6b567e1cf2e041ec80d7098f031c5c9e/contents/blob/
|
16 |
+
# Then cleaned up and modified a bit.
|
17 |
+
def _remove_articles(text: str) -> str:
|
18 |
+
regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
|
19 |
+
return re.sub(regex, " ", text)
|
20 |
+
|
21 |
+
|
22 |
+
def _white_space_fix(text: str) -> str:
|
23 |
+
return " ".join(text.split())
|
24 |
+
|
25 |
+
|
26 |
+
EXCLUDE = set(string.punctuation)
|
27 |
+
|
28 |
+
|
29 |
+
def _remove_punc(text: str) -> str:
    """Drop punctuation characters from ``text`` unless it parses as a number.

    Numeric text is returned untouched so that signs and decimal points
    survive.
    """
    if _is_number(text):
        return text
    kept_chars = [ch for ch in text if ch not in EXCLUDE]
    return "".join(kept_chars)
|
34 |
+
|
35 |
+
|
36 |
+
def _lower(text: str) -> str:
|
37 |
+
return text.lower()
|
38 |
+
|
39 |
+
|
40 |
+
def _tokenize(text: str) -> List[str]:
|
41 |
+
return re.split(" |-", text)
|
42 |
+
|
43 |
+
|
44 |
+
def _normalize_answer(text: str) -> str:
    """Lower text and remove punctuation, articles and extra whitespace.

    Each token is cleaned independently: lowercased, stripped of
    punctuation, number-normalised, stripped of articles and
    whitespace-collapsed. Tokens that end up blank are dropped and the rest
    are joined with single spaces.
    """
    cleaned_tokens = []
    for raw_token in _tokenize(text):
        piece = _lower(raw_token)
        piece = _remove_punc(piece)
        piece = _normalize_number(piece)
        piece = _remove_articles(piece)
        piece = _white_space_fix(piece)
        if piece.strip():
            cleaned_tokens.append(piece)
    return " ".join(cleaned_tokens).strip()
|
54 |
+
|
55 |
+
|
56 |
+
def _is_number(text: str) -> bool:
|
57 |
+
try:
|
58 |
+
float(text)
|
59 |
+
return True
|
60 |
+
except ValueError:
|
61 |
+
return False
|
62 |
+
|
63 |
+
|
64 |
+
def _normalize_number(text: str) -> str:
    """Rewrite numeric text in canonical ``str(float(...))`` form.

    Text that does not parse as a number is returned unchanged.
    """
    return str(float(text)) if _is_number(text) else text
|
69 |
+
|
70 |
+
|
71 |
+
def _answer_to_bags(
    answer: Union[str, List[str], Tuple[str, ...]]
) -> Tuple[List[str], List[Set[str]]]:
    """Normalise an answer and turn each of its spans into a set of tokens.

    A single value is treated as a one-span answer; lists and tuples
    contribute one span per element.

    Returns:
        A pair ``(normalized_spans, token_bags)`` of equal length, where
        each bag is the set of whitespace-separated tokens of the matching
        span.
    """
    spans = answer if isinstance(answer, (list, tuple)) else [answer]
    normalized_spans: List[str] = [_normalize_answer(span) for span in spans]
    token_bags = [set(span.split()) for span in normalized_spans]
    return normalized_spans, token_bags
|
85 |
+
|
86 |
+
|
87 |
+
def _align_bags(predicted: List[Set[str]], gold: List[Set[str]]) -> List[float]:
    """
    Find the best one-to-one pairing of gold and predicted token bags and
    return the F1 of each matched pair.

    A pair only receives a non-zero score when ``_match_numbers_if_present``
    accepts it. The result has ``max(len(gold), len(predicted))`` entries;
    positions without a match stay 0.
    """
    gold_count, predicted_count = len(gold), len(predicted)
    scores = np.zeros([gold_count, predicted_count])
    for gold_index, gold_item in enumerate(gold):
        for pred_index, pred_item in enumerate(predicted):
            if not _match_numbers_if_present(gold_item, pred_item):
                continue
            scores[gold_index, pred_index] = _compute_f1(pred_item, gold_item)

    # linear_sum_assignment minimises cost, so negate to maximise total F1.
    row_ind, col_ind = linear_sum_assignment(-scores)

    max_scores = np.zeros([max(gold_count, predicted_count)])
    for row, column in zip(row_ind, col_ind):
        if scores[row, column] > max_scores[row]:
            max_scores[row] = scores[row, column]
    return max_scores
|
103 |
+
|
104 |
+
|
105 |
+
def _compute_f1(predicted_bag: Set[str], gold_bag: Set[str]) -> float:
|
106 |
+
intersection = len(gold_bag.intersection(predicted_bag))
|
107 |
+
if not predicted_bag:
|
108 |
+
precision = 1.0
|
109 |
+
else:
|
110 |
+
precision = intersection / float(len(predicted_bag))
|
111 |
+
if not gold_bag:
|
112 |
+
recall = 1.0
|
113 |
+
else:
|
114 |
+
recall = intersection / float(len(gold_bag))
|
115 |
+
f1 = (
|
116 |
+
(2 * precision * recall) / (precision + recall)
|
117 |
+
if not (precision == 0.0 and recall == 0.0)
|
118 |
+
else 0.0
|
119 |
+
)
|
120 |
+
return f1
|
121 |
+
|
122 |
+
|
123 |
+
def _match_numbers_if_present(gold_bag: Set[str], predicted_bag: Set[str]) -> bool:
    """Check that the prediction shares a number with the gold bag, if any.

    Returns True when the gold bag holds no numeric token, or when at least
    one of its numeric tokens also appears in the predicted bag.
    """
    gold_numbers = {word for word in gold_bag if _is_number(word)}
    predicted_numbers = {word for word in predicted_bag if _is_number(word)}
    if not gold_numbers:
        return True
    return bool(gold_numbers.intersection(predicted_numbers))
|
135 |
+
|
136 |
+
|
137 |
+
def get_metrics(
    predicted: Union[str, List[str], Tuple[str, ...]],
    gold: Union[str, List[str], Tuple[str, ...]],
) -> Tuple[float, float]:
    """
    Compute exact match and the DROP-style F1 for a prediction.

    Both arguments may be a single string or a list/tuple of strings.
    Exact match is 1.0 when the two normalised span collections have the
    same length and the same set of spans. F1 is the mean of the aligned
    per-bag scores, rounded to two decimals.
    """
    predicted_spans, predicted_token_bags = _answer_to_bags(predicted)
    gold_spans, gold_token_bags = _answer_to_bags(gold)

    spans_agree = set(predicted_spans) == set(gold_spans) and len(predicted_spans) == len(
        gold_spans
    )
    exact_match = 1.0 if spans_agree else 0.0

    f1 = round(np.mean(_align_bags(predicted_token_bags, gold_token_bags)), 2)
    return exact_match, f1
|
160 |
+
|
161 |
+
|
162 |
+
def evaluate_strings(prediction, gold):
    """Mean aligned token-F1 between a prediction and a gold answer.

    Arguments that are neither ``list`` nor ``str`` are converted with
    ``str()`` first. Any exception raised while scoring results in 0.0.
    """
    if type(prediction) not in (list, str):
        prediction = str(prediction)
    if type(gold) not in (list, str):
        gold = str(gold)
    try:
        _, predicted_token_bags = _answer_to_bags(prediction)
        _, gold_token_bags = _answer_to_bags(gold)
        return np.mean(_align_bags(predicted_token_bags, gold_token_bags))
    except Exception:
        return 0.0
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluate_utils/utils.py
ADDED
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Callable, List, Set
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
from scipy.optimize import linear_sum_assignment
|
5 |
+
|
6 |
+
|
7 |
+
def _align_bags(
|
8 |
+
predicted: List[Set[str]],
|
9 |
+
gold: List[Set[str]],
|
10 |
+
method: Callable[[object, object], float],
|
11 |
+
) -> List[float]:
|
12 |
+
"""
|
13 |
+
Takes gold and predicted answer sets and first finds the optimal 1-1 alignment
|
14 |
+
between them and gets maximum metric values over all the answers.
|
15 |
+
"""
|
16 |
+
scores = np.zeros([len(gold), len(predicted)])
|
17 |
+
for gold_index, gold_item in enumerate(gold):
|
18 |
+
for pred_index, pred_item in enumerate(predicted):
|
19 |
+
scores[gold_index, pred_index] = method(pred_item, gold_item)
|
20 |
+
row_ind, col_ind = linear_sum_assignment(-scores)
|
21 |
+
|
22 |
+
max_scores = np.zeros([max(len(gold), len(predicted))])
|
23 |
+
for row, column in zip(row_ind, col_ind):
|
24 |
+
max_scores[row] = max(max_scores[row], scores[row, column])
|
25 |
+
return max_scores
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/evaluation/evaluator.py
ADDED
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# todo export evaluation to a python package
|
2 |
+
|
3 |
+
import json
|
4 |
+
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from .evaluate_utils.evaluate_factory import get_evaluator
|
8 |
+
|
9 |
+
|
10 |
+
def find_isnan(samp):
    """Report whether ``samp`` is a NaN value.

    Inputs ``numpy.isnan`` cannot handle (strings, ``None``, dicts) and
    inputs whose result has no single truth value (arrays or lists with
    several elements) are reported as not-NaN.
    """
    try:
        return bool(np.isnan(samp))
    except (TypeError, ValueError):
        # TypeError: unsupported input type; ValueError: ambiguous truth value.
        return False
|
18 |
+
|
19 |
+
|
20 |
+
def fix_ans(answer):
|
21 |
+
try:
|
22 |
+
answer = (
|
23 |
+
answer.replace("{'", '{"')
|
24 |
+
.replace("', '", '", "')
|
25 |
+
.replace("': '", '": "')
|
26 |
+
.replace("'}", '"}')
|
27 |
+
)
|
28 |
+
answer = answer.replace("': ", '": ')
|
29 |
+
return answer
|
30 |
+
except:
|
31 |
+
return answer
|
32 |
+
|
33 |
+
|
34 |
+
def parse_answer(answer):
    """Classify a gold answer and convert it to the matching Python value.

    Args:
        answer: List of answer lines.

    Returns:
        A pair ``(value, evaluator_name)`` where the name is one of
        ``"number"``, ``"json"``, ``"string"`` or ``"string list"``.
        A single line is tried as a number, then as JSON, and otherwise
        kept as a string. Any other number of lines is treated as JSON when
        every line parses, and as a list of strings otherwise.
    """
    if len(answer) != 1:
        try:
            return [json.loads(fix_ans(ex)) for ex in answer], "json"
        except Exception:
            return answer, "string list"

    ans, is_num = fix_number(answer[0])
    if is_num:
        return ans, "number"
    try:
        return [json.loads(fix_ans(answer[0]))], "json"
    except Exception:
        # The numeric check above already failed, so this is a plain string.
        return answer[0], "string"
|
54 |
+
|
55 |
+
|
56 |
+
def fix_number(number):
    """Try to read ``number`` as a float and report whether that worked.

    Strings have ``$``, ``%`` and ``sqft`` replaced by spaces, are stripped,
    have commas turned into decimal points and the suffix
    `` square kilometers`` removed before parsing.

    Returns:
        ``(float_value, True)`` for a parsable string or a value whose type
        is exactly ``int``; ``(number, False)`` for an unparsable string;
        ``(number, True)`` unchanged for any other type.
    """
    if type(number) == int:
        return float(number), True
    if type(number) != str:
        # NOTE(review): every non-str, non-int value is reported as numeric,
        # not only floats; confirm that callers rely on this.
        return number, True

    candidate = number
    for marker in ("$", "%", "sqft"):
        candidate = candidate.replace(marker, " ")
    candidate = candidate.strip().replace(",", ".").replace(" square kilometers", "")
    try:
        return float(candidate), True
    except ValueError:
        return number, False
|
72 |
+
|
73 |
+
|
74 |
+
def fix_prediction(prediction, gold_answer, evaluator):
    """Bring a prediction into the shape expected by the chosen evaluator.

    Args:
        prediction: The parsed or raw model answer.
        gold_answer: The parsed gold answer; only its type is inspected.
        evaluator: Evaluator name; ``"json"`` triggers line-wise JSON parsing.

    Returns:
        A pair ``(prediction, run_eval)``. ``run_eval`` is False when the
        prediction is empty, or when it is a list with several items while
        the gold answer is a float.
    """
    is_single_numeric_item = (
        type(prediction) == list
        and len(prediction) == 1
        and (
            type(prediction[0]) == int
            or ((type(prediction[0]) == str) and prediction[0].isnumeric())
        )
    )
    if is_single_numeric_item:
        # fix_number returns (value, is_num); keeping the whole tuple here
        # would leak it into the evaluators as the prediction.
        prediction, _ = fix_number(prediction[0])

    if type(prediction) != list:
        prediction, _ = fix_number(prediction)
        if evaluator == "json":
            try:
                prediction = [json.loads(pred) for pred in prediction.split("\n")]
            except Exception:
                # Not a string of JSON lines: wrap the value as one item.
                prediction = [prediction]

    if (hasattr(type(prediction), "__len__")) and (len(prediction) == 0):
        return prediction, False

    if (type(prediction) == list and len(prediction) > 1) and type(gold_answer) == float:
        return prediction, False

    return prediction, True
|
100 |
+
|
101 |
+
|
102 |
+
def _lacks_answer(value):
    """Return True for ``None``, NaN, or a sized value of length zero."""
    if value is None:
        return True
    if hasattr(type(value), "__len__") and len(value) == 0:
        return True
    return find_isnan(value)


def question_scorer(prediction, gold_answer):
    """Score a prediction against a gold answer.

    Args:
        prediction: Raw model answer; parsed as JSON when possible.
        gold_answer: Gold answer, either a newline-separated string or a
            list of answer lines.

    Returns:
        A pair ``(accuracy, has_ans)``. ``has_ans`` is 0.0 when the
        prediction, or every item of a list prediction, is empty, ``None``
        or NaN, and 1.0 otherwise. Accuracy is 0.0 when evaluation is
        skipped.
    """
    try:
        prediction = json.loads(prediction)
    except (TypeError, ValueError):
        # Not JSON (or not a string): score the raw value as it is.
        pass

    answer_list = (
        [x for x in gold_answer.split("\n") if len(x.strip()) > 0]
        if type(gold_answer) != list
        else gold_answer
    )
    gold_answer, evaluator = parse_answer(answer_list)
    prediction, run_eval = fix_prediction(prediction, gold_answer, evaluator)

    if prediction is None:
        # JSON null: there is nothing to evaluate.
        return 0.0, 0.0

    has_ans = 1.0
    # _lacks_answer avoids calling len() on unsized values such as bools.
    if _lacks_answer(prediction):
        has_ans = 0.0
    elif type(prediction) == list and all(_lacks_answer(pred) for pred in prediction):
        has_ans = 0.0

    if not run_eval:
        return 0.0, has_ans

    metric_eval = get_evaluator(evaluator)
    accuracy = metric_eval(prediction, gold_answer)
    return accuracy, has_ans
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/task.py
ADDED
@@ -0,0 +1,142 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import os
|
3 |
+
from typing import Dict, Tuple
|
4 |
+
|
5 |
+
from datasets import load_dataset
|
6 |
+
from playwright.sync_api import Page
|
7 |
+
|
8 |
+
from browsergym.core.task import AbstractBrowserTask
|
9 |
+
|
10 |
+
from .evaluation.evaluator import question_scorer
|
11 |
+
from .utils import add_prediction_to_jsonl
|
12 |
+
|
13 |
+
logger = logging.getLogger(__name__)
|
14 |
+
|
15 |
+
_DEFAULT_OUTPUT_FILE = None
|
16 |
+
|
17 |
+
|
18 |
+
def set_default_output_file(output_file: str):
|
19 |
+
global _DEFAULT_OUTPUT_FILE
|
20 |
+
_DEFAULT_OUTPUT_FILE = output_file
|
21 |
+
|
22 |
+
|
23 |
+
def get_default_output_file():
|
24 |
+
return _DEFAULT_OUTPUT_FILE
|
25 |
+
|
26 |
+
|
27 |
+
# Load dataset
|
28 |
+
|
29 |
+
DATA_DATASET = "AssistantBench/AssistantBench"
|
30 |
+
all_tasks = load_dataset(DATA_DATASET, trust_remote_code=True)
|
31 |
+
|
32 |
+
|
33 |
+
# Extract answers and tasks for validation and test splits
|
34 |
+
def extract_data(split_name: str) -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
    """Build the answer, goal and id lookup tables for one dataset split.

    Keys have the form ``"<split_name>.<row index>"``. A missing answer
    (``None``) is stored as the empty string.

    Returns:
        A tuple ``(answers, tasks, ids)`` of dictionaries sharing the same
        keys.
    """
    answers: Dict[str, str] = {}
    goals: Dict[str, str] = {}
    identifiers: Dict[str, str] = {}
    for i, row in enumerate(all_tasks[split_name]):
        key = f"{split_name}.{i}"
        answers[key] = "" if row["answer"] is None else row["answer"]
        goals[key] = row["task"]
        identifiers[key] = row["id"]
    return answers, goals, identifiers
|
43 |
+
|
44 |
+
|
45 |
+
# Implementation data for testing
|
46 |
+
def get_implementation_testing_data() -> Tuple[Dict[str, str], Dict[str, str], Dict[str, str]]:
|
47 |
+
return (
|
48 |
+
{"imp.0": "20"},
|
49 |
+
{
|
50 |
+
"imp.0": "What is the weather in Paris yesterday in Celsius? Answer with the number only."
|
51 |
+
},
|
52 |
+
{"imp.0": "test_imp_id_0"},
|
53 |
+
)
|
54 |
+
|
55 |
+
|
56 |
+
# Combine dev, test, and implementation-specific testing splits
|
57 |
+
gold_answers_dev, tasks_dev, ids_dev = extract_data("validation")
|
58 |
+
gold_answers_test, tasks_test, ids_test = extract_data("test")
|
59 |
+
gold_answers_impl_testing, tasks_test_impl_testing, ids_imp_testing = (
|
60 |
+
get_implementation_testing_data()
|
61 |
+
)
|
62 |
+
gold_answers = {**gold_answers_dev, **gold_answers_test, **gold_answers_impl_testing}
|
63 |
+
tasks = {**tasks_dev, **tasks_test, **tasks_test_impl_testing}
|
64 |
+
ids = {**ids_dev, **ids_test, **ids_imp_testing}
|
65 |
+
|
66 |
+
|
67 |
+
class AssistantBenchTask(AbstractBrowserTask):
    """Browser task whose goal and gold answer come from the module-level
    ``tasks``, ``gold_answers`` and ``ids`` tables, selected by ``task_id``."""

    @classmethod
    def get_task_id(cls) -> str:
        """
        Generic class for several task ids, this way of obtaining the task id is not compatible for now.
        """
        raise NotImplementedError

    def __init__(
        self, seed: int, task_id: str, output_file: str = None, save_predictions: bool = False
    ) -> None:
        """
        Args:
            seed (int): Random seed for task initialization.
            task_id (str): Unique identifier for the task (for the BrowserGym environment).
            output_file (str, optional): Path to the output file for saving results, needed for test set.
            save_predictions (bool, optional): Save predictions to the output file (yes/no).
        """
        super().__init__(seed)
        self.locale = "en-US"
        self.timezone_id = "America/New_York"

        self.task_id = task_id
        self.start_url = "https://google.com"
        # All three tables share the same string keys, so look each of them
        # up with the same normalised key.
        task_key = str(self.task_id)
        self.goal = tasks[task_key]
        self.gold = gold_answers[task_key]
        self.ab_task_id = ids[task_key]
        self.save_predictions = save_predictions

        self.output_file = output_file

        # set output_file using the global default value, if not provided in constructor
        if not self.output_file:
            self.output_file = get_default_output_file()
        # use env variable in last resort
        if not self.output_file:
            self.output_file = os.getenv("ASSISTANTBENCH_OUTPUT_FILE", None)

        if self.save_predictions and self.output_file:
            logger.info(f"Task prediction will be written to output file {self.output_file}")

    def setup(self, page: Page) -> Tuple[str, dict]:
        """Open the start URL and, when saving is enabled, reserve an empty
        entry for this task in the output file.

        Returns:
            The task goal and an empty info dictionary.
        """
        logger.info(f"Navigating to start url: {self.start_url}")
        page.goto(self.start_url, timeout=50000)
        if self.save_predictions and self.output_file:
            # create an empty task entry in the output file (will raise an Exception if the entry is already there)
            add_prediction_to_jsonl(
                file_path=self.output_file,
                task_id=self.ab_task_id,
                prediction="",
                override_if_exists=False,
            )
        return self.goal, {}

    def teardown(self) -> None:
        """Nothing to clean up for this task."""
        pass

    def validate(self, page: Page, chat_messages: list[dict]) -> Tuple[float, bool, str, dict]:
        """Score the agent's answer once the last chat message is from the assistant.

        Returns:
            ``(accuracy, done, msg, info)``. Until the assistant has
            answered, accuracy is 0.0 and ``done`` is False.
        """
        accuracy, done, msg, info = 0.0, False, "", {}

        # eval when the agent returns a response
        if chat_messages and chat_messages[-1]["role"] == "assistant":
            done = True
            prediction = chat_messages[-1]["message"]
            if self.save_predictions and self.output_file:
                # update the task entry in the output file
                add_prediction_to_jsonl(
                    file_path=self.output_file,
                    task_id=self.ab_task_id,
                    prediction=prediction,
                    override_if_exists=True,
                )
            # The second element (whether an answer was given) is not reported.
            accuracy, _ = question_scorer(prediction, self.gold)

        return accuracy, done, msg, info
|
BrowserGym/browsergym/assistantbench/src/browsergym/assistantbench/utils.py
ADDED
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import json
|
2 |
+
import logging
|
3 |
+
import os
|
4 |
+
import pathlib
|
5 |
+
import time
|
6 |
+
|
7 |
+
logger = logging.getLogger(__name__)
|
8 |
+
|
9 |
+
|
10 |
+
def add_prediction_to_jsonl(
    file_path: str, task_id: str, prediction: object, override_if_exists: bool
) -> None:
    """
    Add or update one ``{"id", "answer"}`` record in a JSONL file, guarded
    by a lock file so that concurrent writers do not interleave.

    Args:
        file_path: Path of the JSONL file; created if it does not exist.
        task_id: Value stored under ``"id"``; identifies the record.
        prediction: Value stored under ``"answer"``.
        override_if_exists: Whether an existing record for ``task_id`` may
            be overwritten.

    Raises:
        RuntimeError: If the lock cannot be acquired within 10 seconds.
        ValueError: If a record for ``task_id`` already exists and
            ``override_if_exists`` is False.
    """
    lock_file_path = pathlib.Path(file_path).with_suffix(".lock")
    lock_max_wait = 10  # 10 seconds

    # Acquire lock (atomic file creation)
    start_time = time.time()
    while True:
        try:
            fd = os.open(lock_file_path, os.O_CREAT | os.O_EXCL | os.O_WRONLY)
            with os.fdopen(fd, "w") as f:
                f.write("lock")
            break
        except FileExistsError:
            # give up if max wait time reached
            seconds_waited = time.time() - start_time
            if seconds_waited >= lock_max_wait:
                raise RuntimeError(
                    f"Lock file could not be acquired after {seconds_waited} seconds ({lock_file_path})"
                )
            # wait for lock release
            logger.info(f"Waiting for lock file to be released: {lock_file_path}")
            time.sleep(1)  # 1 sec

    logger.info(f"Lock file acquired: {lock_file_path}")

    try:
        # Load existing data, if any
        data = []
        if os.path.exists(file_path):
            with open(file_path, "r") as f:
                data.extend([json.loads(line) for line in f if line.strip()])  # Skip empty lines

        # Check if task_id already exists
        existing_record = next((entry for entry in data if entry["id"] == task_id), None)

        # Add or update the record
        if not existing_record:
            # Add new record
            data.append({"id": task_id, "answer": prediction})
        elif override_if_exists:
            # Update existing record
            existing_record["answer"] = prediction
        else:
            raise ValueError(
                f"Prediction for task ID {repr(task_id)} already exists in file {file_path}."
            )

        # Write data back to the file (this also creates it when missing)
        with open(file_path, "w") as f:
            for entry in data:
                f.write(json.dumps(entry) + "\n")
    finally:
        # Always release the lock, even on error; a leftover lock file would
        # make every later writer wait and then fail with RuntimeError.
        os.remove(lock_file_path)
        logger.info(f"Lock file released: {lock_file_path}")
|
BrowserGym/browsergym/browsergym.egg-info/PKG-INFO
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
Metadata-Version: 2.4
|
2 |
+
Name: browsergym
|
3 |
+
Version: 0.13.4
|
4 |
+
Summary: BrowserGym: a gym environment for web task automation in the Chromium browser
|
5 |
+
Author: Rim Assouel, Léo Boisvert, Massimo Caccia, Alex Drouin, Maxime Gasse, Imene Kerboua, Alex Lacoste, Thibault Le Sellier De Chezelles, Tom Marty, Aman Jaiswal
|
6 |
+
License: Apache-2.0
|
7 |
+
Classifier: Development Status :: 3 - Alpha
|
8 |
+
Classifier: Programming Language :: Python :: 3
|
9 |
+
Classifier: Operating System :: OS Independent
|
10 |
+
Classifier: Intended Audience :: Science/Research
|
11 |
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
12 |
+
Classifier: License :: OSI Approved :: Apache Software License
|
13 |
+
Requires-Python: >3.10
|
14 |
+
Description-Content-Type: text/markdown
|
15 |
+
Requires-Dist: browsergym-core==0.13.4
|
16 |
+
Requires-Dist: browsergym-miniwob==0.13.4
|
17 |
+
Requires-Dist: browsergym-webarena==0.13.4
|
18 |
+
Requires-Dist: browsergym-visualwebarena==0.13.4
|
19 |
+
Requires-Dist: browsergym-assistantbench==0.13.4
|
20 |
+
Requires-Dist: browsergym-experiments==0.13.4
|
21 |
+
Requires-Dist: browsergym-workarena>=0.4.1
|
22 |
+
Requires-Dist: weblinx-browsergym>=0.0.2
|
BrowserGym/browsergym/browsergym.egg-info/SOURCES.txt
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
pyproject.toml
|
2 |
+
browsergym.egg-info/PKG-INFO
|
3 |
+
browsergym.egg-info/SOURCES.txt
|
4 |
+
browsergym.egg-info/dependency_links.txt
|
5 |
+
browsergym.egg-info/requires.txt
|
6 |
+
browsergym.egg-info/top_level.txt
|
BrowserGym/browsergym/browsergym.egg-info/dependency_links.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
BrowserGym/browsergym/browsergym.egg-info/requires.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
browsergym-core==0.13.4
|
2 |
+
browsergym-miniwob==0.13.4
|
3 |
+
browsergym-webarena==0.13.4
|
4 |
+
browsergym-visualwebarena==0.13.4
|
5 |
+
browsergym-assistantbench==0.13.4
|
6 |
+
browsergym-experiments==0.13.4
|
7 |
+
browsergym-workarena>=0.4.1
|
8 |
+
weblinx-browsergym>=0.0.2
|
BrowserGym/browsergym/browsergym.egg-info/top_level.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
|
BrowserGym/browsergym/core/README.md
ADDED
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# BrowserGym core
|
2 |
+
|
3 |
+
This package provides `browsergym.core`, which provides the core functionalities of [BrowserGym](https://github.com/ServiceNow/BrowserGym).
|
4 |
+
|
5 |
+
## Setup
|
6 |
+
|
7 |
+
1. Install the package
|
8 |
+
```sh
|
9 |
+
pip install browsergym-core
|
10 |
+
```
|
BrowserGym/browsergym/core/pyproject.toml
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
[build-system]
|
2 |
+
requires = ["hatchling", "hatch-requirements-txt"]
|
3 |
+
build-backend = "hatchling.build"
|
4 |
+
|
5 |
+
[project]
|
6 |
+
name = "browsergym-core"
|
7 |
+
description = "BrowserGym: a gym environment for web task automation in the Chromium browser"
|
8 |
+
authors = [
|
9 |
+
{name = "Rim Assouel"},
|
10 |
+
{name = "Léo Boisvert"},
|
11 |
+
{name = "Massimo Caccia"},
|
12 |
+
{name = "Alex Drouin"},
|
13 |
+
{name = "Maxime Gasse"},
|
14 |
+
{name = "Imene Kerboua"},
|
15 |
+
{name = "Alex Lacoste"},
|
16 |
+
{name = "Thibault Le Sellier De Chezelles"},
|
17 |
+
{name = "Tom Marty"},
|
18 |
+
]
|
19 |
+
readme = "README.md"
|
20 |
+
requires-python = ">3.9"
|
21 |
+
license = {text = "Apache-2.0"}
|
22 |
+
classifiers = [
|
23 |
+
"Development Status :: 3 - Alpha",
|
24 |
+
"Programming Language :: Python :: 3",
|
25 |
+
"Operating System :: OS Independent",
|
26 |
+
"Intended Audience :: Science/Research",
|
27 |
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
28 |
+
"License :: OSI Approved :: Apache Software License",
|
29 |
+
]
|
30 |
+
dynamic = ["dependencies", "version"]
|
31 |
+
|
32 |
+
[project.urls]
|
33 |
+
homepage = "https://github.com/ServiceNow/BrowserGym"
|
34 |
+
|
35 |
+
[tool.hatch.version]
|
36 |
+
path = "src/browsergym/core/__init__.py"
|
37 |
+
|
38 |
+
[tool.hatch.metadata.hooks.requirements_txt]
|
39 |
+
files = ["requirements.txt"]
|
40 |
+
|
41 |
+
[tool.hatch.build.targets.wheel]
|
42 |
+
packages = ["src/browsergym"]
|
BrowserGym/browsergym/core/requirements.txt
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
playwright==1.44
|
2 |
+
gymnasium>=0.27
|
3 |
+
numpy>=1.14
|
4 |
+
pyparsing>=3
|
5 |
+
Pillow>=10.1
|
6 |
+
beautifulsoup4>=4.12
|
7 |
+
lxml>=4.9
|
8 |
+
mcp[cli]>=1.6.0
|
BrowserGym/browsergym/core/src/browsergym/core/__init__.py
ADDED
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
__version__ = "0.13.4"
|
2 |
+
|
3 |
+
import playwright.sync_api
|
4 |
+
|
5 |
+
# we use a global playwright instance
|
6 |
+
_PLAYWRIGHT = None
|
7 |
+
|
8 |
+
|
9 |
+
def _set_global_playwright(pw: playwright.sync_api.Playwright):
|
10 |
+
global _PLAYWRIGHT
|
11 |
+
_PLAYWRIGHT = pw
|
12 |
+
|
13 |
+
|
14 |
+
def _get_global_playwright():
    """Return the shared Playwright instance, starting one on first use."""
    global _PLAYWRIGHT
    if _PLAYWRIGHT:
        return _PLAYWRIGHT

    started = playwright.sync_api.sync_playwright().start()
    _set_global_playwright(started)
    return _PLAYWRIGHT
|
21 |
+
|
22 |
+
|
23 |
+
# register the open-ended task
|
24 |
+
from .registration import register_task
|
25 |
+
from .task import OpenEndedTask
|
26 |
+
|
27 |
+
register_task(OpenEndedTask.get_task_id(), OpenEndedTask)
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (1.14 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/chat.cpython-311.pyc
ADDED
Binary file (6.89 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/constants.cpython-311.pyc
ADDED
Binary file (428 Bytes). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/env.cpython-311.pyc
ADDED
Binary file (31.2 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/observation.cpython-311.pyc
ADDED
Binary file (22.7 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/registration.cpython-311.pyc
ADDED
Binary file (3.49 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/spaces.cpython-311.pyc
ADDED
Binary file (8.42 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/__pycache__/task.cpython-311.pyc
ADDED
Binary file (5.53 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/__init__.py
ADDED
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
_DEMO_MODE = False
|
2 |
+
|
3 |
+
|
4 |
+
def set_global_demo_mode(demo_mode: bool):
    """
    Overwrites the module-level demo-mode flag.

    Args:
        demo_mode: the new value stored in `_DEMO_MODE`.
    """
    global _DEMO_MODE
    _DEMO_MODE = demo_mode
|
7 |
+
|
8 |
+
|
9 |
+
def get_global_demo_mode():
    """Returns the current value of the module-level demo-mode flag `_DEMO_MODE`."""
    # read-only access, so no `global` declaration is required
    return _DEMO_MODE
|
BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (561 Bytes). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/base.cpython-311.pyc
ADDED
Binary file (3.12 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/functions.cpython-311.pyc
ADDED
Binary file (26.2 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/highlevel.cpython-311.pyc
ADDED
Binary file (12.4 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/parsers.cpython-311.pyc
ADDED
Binary file (6.82 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/__pycache__/utils.cpython-311.pyc
ADDED
Binary file (12.2 kB). View file
|
|
BrowserGym/browsergym/core/src/browsergym/core/action/base.py
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from abc import ABC, abstractmethod
|
2 |
+
|
3 |
+
import playwright.sync_api
|
4 |
+
|
5 |
+
from . import get_global_demo_mode
|
6 |
+
|
7 |
+
|
8 |
+
class AbstractActionSet(ABC):
    """
    Abstract interface of an action set.

    Concrete subclasses must implement `describe`, `example_action` and
    `to_python_code`; the class cannot be instantiated otherwise.
    """

    def __init__(self, strict: bool = False):
        # the flag is only stored here; its interpretation is left to subclasses
        self.strict = strict

    @abstractmethod
    def describe(self, with_long_description: bool = True, with_examples: bool = True) -> str:
        """
        Builds a human-readable description of the action space.

        Args:
            with_long_description: flag controlling the description, defaults to True.
            with_examples: flag controlling the examples, defaults to True.

        Returns:
            The description, as a string.
        """

    @abstractmethod
    def example_action(self, abstract: bool) -> str:
        """
        Produces one example action.

        Args:
            abstract: flag selecting the kind of example to produce.

        Returns:
            The example action, as a string.
        """

    @abstractmethod
    def to_python_code(self, action) -> str:
        """
        Translates an action into browsergym-compatible python code.

        Args:
            action: the action to translate.

        Returns:
            Executable python code that performs the action in a browsergym environment.
        """
|
35 |
+
|
36 |
+
|
37 |
+
def execute_python_code(
    code: str,
    page: playwright.sync_api.Page,
    send_message_to_user: callable,
    report_infeasible_instructions: callable,
):
    """
    Runs a Python snippet through `exec`, using a fresh namespace as its globals. The
    namespace exposes `page`, `send_message_to_user`, `report_infeasible_instructions`
    and `DEMO_MODE` (the current global demo-mode flag) to the snippet.

    WARNING: this is not safe!
    https://stackoverflow.com/questions/77655440/can-you-protect-a-python-variable-with-exec

    Args:
        code: the Python code to execute, as a string.
        page: the playwright page that will be made accessible to the code.
        send_message_to_user: utility function that will be made accessible to the code. It should take one text argument.
        report_infeasible_instructions: utility function that will be made accessible to the code. It should take one text argument.
    """
    # names the executed snippet sees as its global variables
    exec_namespace = dict(
        page=page,
        send_message_to_user=send_message_to_user,
        report_infeasible_instructions=report_infeasible_instructions,
        DEMO_MODE=get_global_demo_mode(),
    )

    exec(code, exec_namespace)
|
BrowserGym/browsergym/core/src/browsergym/core/action/functions.py
ADDED
@@ -0,0 +1,624 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# these are placeholders
|
2 |
+
# all these symbols will be available in browsergym actions
|
3 |
+
from typing import Literal
|
4 |
+
|
5 |
+
import playwright.sync_api
|
6 |
+
|
7 |
+
from .utils import (
|
8 |
+
add_demo_mode_effects,
|
9 |
+
call_fun,
|
10 |
+
get_elem_by_bid,
|
11 |
+
highlight_by_box,
|
12 |
+
smooth_move_visual_cursor_to,
|
13 |
+
)
|
14 |
+
|
15 |
+
page: playwright.sync_api.Page = None
|
16 |
+
send_message_to_user: callable = None
|
17 |
+
report_infeasible_instructions: callable = None
|
18 |
+
demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"] = None
|
19 |
+
retry_with_force: bool = False
|
20 |
+
|
21 |
+
"""IMPORTANT
|
22 |
+
The following primitives are meant to be included in the browsergym action using
|
23 |
+
inspect.getsource().
|
24 |
+
"""
|
25 |
+
|
26 |
+
|
27 |
+
def send_msg_to_user(text: str):
|
28 |
+
"""
|
29 |
+
Sends a message to the user.
|
30 |
+
|
31 |
+
Examples:
|
32 |
+
send_msg_to_user("Based on the results of my search, the city was built in 1751.")
|
33 |
+
"""
|
34 |
+
send_message_to_user(text)
|
35 |
+
|
36 |
+
|
37 |
+
def report_infeasible(reason: str):
|
38 |
+
"""
|
39 |
+
Notifies the user that their instructions are infeasible.
|
40 |
+
|
41 |
+
Examples:
|
42 |
+
report_infeasible("I cannot follow these instructions because there is no email field in this form.")
|
43 |
+
"""
|
44 |
+
report_infeasible_instructions(reason)
|
45 |
+
|
46 |
+
|
47 |
+
def noop(wait_ms: float = 1000):
|
48 |
+
"""
|
49 |
+
Do nothing, and optionally wait for the given time (in milliseconds).
|
50 |
+
|
51 |
+
Examples:
|
52 |
+
noop()
|
53 |
+
noop(500)
|
54 |
+
"""
|
55 |
+
page.wait_for_timeout(wait_ms)
|
56 |
+
|
57 |
+
|
58 |
+
# https://playwright.dev/docs/input#text-input
|
59 |
+
def fill(bid: str, value: str):
    """
    Fill out a form field. It focuses the element and triggers an input event with the entered text.
    It works for <input>, <textarea> and [contenteditable] elements.

    Examples:
        fill('237', 'example value')
        fill('45', "multi-line\\nexample")
        fill('a12', "example with \\"quotes\\"")
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)

    def do(force: bool):
        if demo_mode != "off":
            # demo mode types key by key: ~2 seconds for the whole text, at least
            # 10 ms per key. max(len, 1) keeps an empty value from dividing by zero
            # (an empty value then simply clears the field).
            delay = max(2000 / max(len(value), 1), 10)
            elem.clear(force=force, timeout=500)
            elem.type(value, delay=delay, timeout=0)  # no timeout
        else:
            elem.fill(value, force=force, timeout=500)

    call_fun(do, retry_with_force)
|
81 |
+
|
82 |
+
|
83 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-check
|
84 |
+
def check(bid: str):
|
85 |
+
"""
|
86 |
+
Ensure a checkbox or radio element is checked.
|
87 |
+
|
88 |
+
Examples:
|
89 |
+
check('55')
|
90 |
+
"""
|
91 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
92 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
|
93 |
+
|
94 |
+
def do(force: bool):
|
95 |
+
elem.check(force=force, timeout=500)
|
96 |
+
|
97 |
+
call_fun(do, retry_with_force)
|
98 |
+
|
99 |
+
|
100 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-uncheck
|
101 |
+
def uncheck(bid: str):
|
102 |
+
"""
|
103 |
+
Ensure a checkbox or radio element is unchecked.
|
104 |
+
|
105 |
+
Examples:
|
106 |
+
uncheck('a5289')
|
107 |
+
"""
|
108 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
109 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
|
110 |
+
|
111 |
+
def do(force: bool):
|
112 |
+
elem.uncheck(force=force, timeout=500)
|
113 |
+
|
114 |
+
call_fun(do, retry_with_force)
|
115 |
+
|
116 |
+
|
117 |
+
# https://playwright.dev/docs/input#select-options
|
118 |
+
def select_option(bid: str, options: str | list[str]):
|
119 |
+
"""
|
120 |
+
Select one or multiple options in a <select> element. You can specify
|
121 |
+
option value or label to select. Multiple options can be selected.
|
122 |
+
|
123 |
+
Examples:
|
124 |
+
select_option('a48', "blue")
|
125 |
+
select_option('c48', ["red", "green", "blue"])
|
126 |
+
"""
|
127 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
128 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
|
129 |
+
|
130 |
+
def do(force: bool):
|
131 |
+
elem.select_option(options, force=force, timeout=500)
|
132 |
+
|
133 |
+
call_fun(do, retry_with_force)
|
134 |
+
|
135 |
+
|
136 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-click
|
137 |
+
def click(
|
138 |
+
bid: str,
|
139 |
+
button: Literal["left", "middle", "right"] = "left",
|
140 |
+
modifiers: list[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
|
141 |
+
):
|
142 |
+
"""
|
143 |
+
Click an element.
|
144 |
+
|
145 |
+
Examples:
|
146 |
+
click('a51')
|
147 |
+
click('b22', button="right")
|
148 |
+
click('48', button="middle", modifiers=["Shift"])
|
149 |
+
"""
|
150 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
151 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
|
152 |
+
|
153 |
+
def do(force: bool):
|
154 |
+
elem.click(button=button, modifiers=modifiers, force=force, timeout=500)
|
155 |
+
|
156 |
+
call_fun(do, retry_with_force)
|
157 |
+
|
158 |
+
|
159 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-dblclick
|
160 |
+
def dblclick(
    bid: str,
    button: Literal["left", "middle", "right"] = "left",
    modifiers: list[Literal["Alt", "Control", "ControlOrMeta", "Meta", "Shift"]] = [],
):
    """
    Double click an element.

    Examples:
        dblclick('12')
        dblclick('ca42', button="right")
        dblclick('178', button="middle", modifiers=["Shift"])
    """
    elem = get_elem_by_bid(page, bid, demo_mode != "off")
    add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)

    def do(force: bool):
        # must be dblclick(): a plain click() would only dispatch a single click
        elem.dblclick(button=button, modifiers=modifiers, force=force, timeout=500)

    call_fun(do, retry_with_force)
|
180 |
+
|
181 |
+
|
182 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-hover
|
183 |
+
def hover(bid: str):
|
184 |
+
"""
|
185 |
+
Hover over an element.
|
186 |
+
|
187 |
+
Examples:
|
188 |
+
hover('b8')
|
189 |
+
"""
|
190 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
191 |
+
add_demo_mode_effects(
|
192 |
+
page, elem, bid, demo_mode=demo_mode, move_cursor=True, highlight_box=False
|
193 |
+
)
|
194 |
+
|
195 |
+
def do(force: bool):
|
196 |
+
elem.hover(force=force, timeout=500)
|
197 |
+
|
198 |
+
call_fun(do, retry_with_force)
|
199 |
+
|
200 |
+
|
201 |
+
# https://playwright.dev/python/docs/input#keys-and-shortcuts
|
202 |
+
def press(bid: str, key_comb: str):
|
203 |
+
"""
|
204 |
+
Focus the matching element and press a combination of keys. It accepts
|
205 |
+
the logical key names that are emitted in the keyboardEvent.key property
|
206 |
+
of the keyboard events: Backquote, Minus, Equal, Backslash, Backspace,
|
207 |
+
Tab, Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
|
208 |
+
ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
|
209 |
+
alternatively specify a single character you'd like to produce such as "a"
|
210 |
+
or "#". Following modification shortcuts are also supported: Shift, Control,
|
211 |
+
Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta resolves to Control on
|
212 |
+
Windows and Linux and to Meta on macOS.
|
213 |
+
|
214 |
+
Examples:
|
215 |
+
press('88', 'Backspace')
|
216 |
+
press('a26', 'ControlOrMeta+a')
|
217 |
+
press('a61', 'Meta+Shift+t')
|
218 |
+
"""
|
219 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
220 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
|
221 |
+
elem.press(key_comb, timeout=500)
|
222 |
+
|
223 |
+
|
224 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-focus
|
225 |
+
def focus(bid: str):
|
226 |
+
"""
|
227 |
+
Focus the matching element.
|
228 |
+
|
229 |
+
Examples:
|
230 |
+
focus('b455')
|
231 |
+
"""
|
232 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
233 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
|
234 |
+
elem.focus(timeout=500)
|
235 |
+
|
236 |
+
|
237 |
+
# https://playwright.dev/python/docs/api/class-locator#locator-clear
|
238 |
+
def clear(bid: str):
|
239 |
+
"""
|
240 |
+
Clear the input field.
|
241 |
+
|
242 |
+
Examples:
|
243 |
+
clear('996')
|
244 |
+
"""
|
245 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
246 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=False)
|
247 |
+
elem.clear(timeout=500)
|
248 |
+
|
249 |
+
|
250 |
+
# https://playwright.dev/python/docs/input#drag-and-drop
|
251 |
+
def drag_and_drop(from_bid: str, to_bid: str):
    """
    Perform a drag & drop. Hover the element that will be dragged. Press
    left mouse button. Move mouse to the element that will receive the
    drop. Release left mouse button.

    Examples:
        drag_and_drop('56', '498')
    """
    from_elem = get_elem_by_bid(page, from_bid, demo_mode != "off")
    add_demo_mode_effects(page, from_elem, from_bid, demo_mode=demo_mode, move_cursor=True)
    from_elem.hover(timeout=500)
    page.mouse.down()

    try:
        to_elem = get_elem_by_bid(page, to_bid, demo_mode != "off")
        add_demo_mode_effects(page, to_elem, to_bid, demo_mode=demo_mode, move_cursor=True)
        to_elem.hover(timeout=500)
    finally:
        # always release the button, even when the drop target cannot be resolved or
        # hovered; otherwise the mouse would stay pressed after the error propagates
        page.mouse.up()
|
269 |
+
|
270 |
+
|
271 |
+
# https://playwright.dev/python/docs/api/class-mouse#mouse-wheel
|
272 |
+
def scroll(delta_x: float, delta_y: float):
|
273 |
+
"""
|
274 |
+
Scroll horizontally and vertically. Amounts in pixels, positive for right or down scrolling, negative for left or up scrolling. Dispatches a wheel event.
|
275 |
+
|
276 |
+
Examples:
|
277 |
+
scroll(0, 200)
|
278 |
+
scroll(-50.2, -100.5)
|
279 |
+
"""
|
280 |
+
page.mouse.wheel(delta_x, delta_y)
|
281 |
+
|
282 |
+
|
283 |
+
# https://playwright.dev/python/docs/api/class-mouse#mouse-move
|
284 |
+
def mouse_move(x: float, y: float):
|
285 |
+
"""
|
286 |
+
Move the mouse to a location. Uses absolute client coordinates in pixels.
|
287 |
+
Dispatches a mousemove event.
|
288 |
+
|
289 |
+
Examples:
|
290 |
+
mouse_move(65.2, 158.5)
|
291 |
+
"""
|
292 |
+
if demo_mode != "off":
|
293 |
+
smooth_move_visual_cursor_to(page, x, y)
|
294 |
+
page.mouse.move(x, y)
|
295 |
+
|
296 |
+
|
297 |
+
# https://playwright.dev/python/docs/api/class-mouse#mouse-up
|
298 |
+
def mouse_up(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
|
299 |
+
"""
|
300 |
+
Move the mouse to a location then release a mouse button. Dispatches
|
301 |
+
mousemove and mouseup events.
|
302 |
+
|
303 |
+
Examples:
|
304 |
+
mouse_up(250, 120)
|
305 |
+
mouse_up(47, 252, 'right')
|
306 |
+
"""
|
307 |
+
if demo_mode != "off":
|
308 |
+
smooth_move_visual_cursor_to(page, x, y)
|
309 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
310 |
+
page.mouse.move(x, y)
|
311 |
+
page.mouse.up(button=button)
|
312 |
+
|
313 |
+
|
314 |
+
# https://playwright.dev/python/docs/api/class-mouse#mouse-down
|
315 |
+
def mouse_down(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
|
316 |
+
"""
|
317 |
+
Move the mouse to a location then press and hold a mouse button. Dispatches
|
318 |
+
mousemove and mousedown events.
|
319 |
+
|
320 |
+
Examples:
|
321 |
+
mouse_down(140.2, 580.1)
|
322 |
+
mouse_down(458, 254.5, 'middle')
|
323 |
+
"""
|
324 |
+
if demo_mode != "off":
|
325 |
+
smooth_move_visual_cursor_to(page, x, y)
|
326 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
327 |
+
page.mouse.move(x, y)
|
328 |
+
page.mouse.down(button=button)
|
329 |
+
|
330 |
+
|
331 |
+
# https://playwright.dev/python/docs/api/class-mouse#mouse-click
|
332 |
+
def mouse_click(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
|
333 |
+
"""
|
334 |
+
Move the mouse to a location and click a mouse button. Dispatches mousemove,
|
335 |
+
mousedown and mouseup events.
|
336 |
+
|
337 |
+
Examples:
|
338 |
+
mouse_click(887.2, 68)
|
339 |
+
mouse_click(56, 712.56, 'right')
|
340 |
+
"""
|
341 |
+
if demo_mode != "off":
|
342 |
+
smooth_move_visual_cursor_to(page, x, y)
|
343 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
344 |
+
page.mouse.click(x, y, button=button)
|
345 |
+
|
346 |
+
|
347 |
+
# https://playwright.dev/python/docs/api/class-mouse#mouse-dblclick
|
348 |
+
def mouse_dblclick(x: float, y: float, button: Literal["left", "middle", "right"] = "left"):
|
349 |
+
"""
|
350 |
+
Move the mouse to a location and double click a mouse button. Dispatches
|
351 |
+
mousemove, mousedown and mouseup events.
|
352 |
+
|
353 |
+
Examples:
|
354 |
+
mouse_dblclick(5, 236)
|
355 |
+
mouse_dblclick(87.5, 354, 'right')
|
356 |
+
"""
|
357 |
+
if demo_mode != "off":
|
358 |
+
smooth_move_visual_cursor_to(page, x, y)
|
359 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
360 |
+
page.mouse.dblclick(x, y, button=button)
|
361 |
+
|
362 |
+
|
363 |
+
def mouse_drag_and_drop(from_x: float, from_y: float, to_x: float, to_y: float):
|
364 |
+
"""
|
365 |
+
Drag and drop from a location to a location. Uses absolute client
|
366 |
+
coordinates in pixels. Dispatches mousemove, mousedown and mouseup
|
367 |
+
events.
|
368 |
+
|
369 |
+
Examples:
|
370 |
+
mouse_drag_and_drop(10.7, 325, 235.6, 24.54)
|
371 |
+
"""
|
372 |
+
if demo_mode != "off":
|
373 |
+
x, y = from_x, from_y
|
374 |
+
smooth_move_visual_cursor_to(page, x, y)
|
375 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
376 |
+
page.mouse.move(from_x, from_y)
|
377 |
+
page.mouse.down()
|
378 |
+
if demo_mode != "off":
|
379 |
+
x, y = to_x, to_y
|
380 |
+
smooth_move_visual_cursor_to(page, x, y)
|
381 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
382 |
+
page.mouse.move(to_x, to_y)
|
383 |
+
page.mouse.up()
|
384 |
+
|
385 |
+
|
386 |
+
# https://playwright.dev/python/docs/api/class-keyboard#keyboard-press
|
387 |
+
def keyboard_press(key: str):
    """
    Press a combination of keys. Accepts the logical key names that are
    emitted in the keyboardEvent.key property of the keyboard events:
    Backquote, Minus, Equal, Backslash, Backspace, Tab, Delete, Escape,
    ArrowDown, End, Enter, Home, Insert, PageDown, PageUp, ArrowRight,
    ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
    alternatively specify a single character you'd like to produce such
    as "a" or "#". Following modification shortcuts are also supported:
    Shift, Control, Alt, Meta, ShiftLeft, ControlOrMeta. ControlOrMeta
    resolves to Control on Windows and Linux and to Meta on macOS.

    Examples:
        keyboard_press('Backspace')
        keyboard_press('ControlOrMeta+a')
        keyboard_press('Meta+Shift+t')
        keyboard_press('PageDown')
    """
    page.keyboard.press(key)
|
406 |
+
|
407 |
+
|
408 |
+
# https://playwright.dev/python/docs/api/class-keyboard#keyboard-up
|
409 |
+
def keyboard_up(key: str):
|
410 |
+
"""
|
411 |
+
Release a keyboard key. Dispatches a keyup event. Accepts the logical
|
412 |
+
key names that are emitted in the keyboardEvent.key property of the
|
413 |
+
keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
|
414 |
+
Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
|
415 |
+
ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc.
|
416 |
+
You can alternatively specify a single character you'd like to produce
|
417 |
+
such as "a" or "#".
|
418 |
+
|
419 |
+
Examples:
|
420 |
+
keyboard_up('Shift')
|
421 |
+
keyboard_up('c')
|
422 |
+
"""
|
423 |
+
page.keyboard.up(key)
|
424 |
+
|
425 |
+
|
426 |
+
# https://playwright.dev/python/docs/api/class-keyboard#keyboard-down
|
427 |
+
def keyboard_down(key: str):
    """
    Press and holds a keyboard key. Dispatches a keydown event. Accepts the
    logical key names that are emitted in the keyboardEvent.key property of
    the keyboard events: Backquote, Minus, Equal, Backslash, Backspace, Tab,
    Delete, Escape, ArrowDown, End, Enter, Home, Insert, PageDown, PageUp,
    ArrowRight, ArrowUp, F1 - F12, Digit0 - Digit9, KeyA - KeyZ, etc. You can
    alternatively specify a single character such as "a" or "#".

    Examples:
        keyboard_down('Shift')
        keyboard_down('c')
    """
    page.keyboard.down(key)
|
441 |
+
|
442 |
+
|
443 |
+
# https://playwright.dev/python/docs/api/class-keyboard#keyboard-type
|
444 |
+
def keyboard_type(text: str):
    """
    Types a string of text through the keyboard. Sends a keydown, keypress/input,
    and keyup event for each character in the text. Modifier keys DO NOT affect
    keyboard_type. Holding down Shift will not type the text in upper case.

    Examples:
        keyboard_type('Hello world!')
    """
    if demo_mode != "off":
        # demo mode slows typing down: ~2 seconds for the whole text, at least 10 ms
        # per key. max(len, 1) keeps an empty text from dividing by zero.
        delay = max(2000 / max(len(text), 1), 10)
    else:
        delay = None
    page.keyboard.type(text, delay=delay)
|
458 |
+
|
459 |
+
|
460 |
+
# https://playwright.dev/python/docs/api/class-keyboard#keyboard-insert-text
|
461 |
+
def keyboard_insert_text(text: str):
|
462 |
+
"""
|
463 |
+
Insert a string of text in the currently focused element. Dispatches only input
|
464 |
+
event, does not emit the keydown, keyup or keypress events. Modifier keys DO NOT
|
465 |
+
affect keyboard_insert_text. Holding down Shift will not type the text in upper
|
466 |
+
case.
|
467 |
+
|
468 |
+
Examples:
|
469 |
+
keyboard_insert_text('Hello world!')
|
470 |
+
"""
|
471 |
+
page.keyboard.insert_text(text)
|
472 |
+
|
473 |
+
|
474 |
+
# https://playwright.dev/python/docs/api/class-page#page-goto
|
475 |
+
def goto(url: str):
|
476 |
+
"""
|
477 |
+
Navigate to a url.
|
478 |
+
|
479 |
+
Examples:
|
480 |
+
goto('http://www.example.com')
|
481 |
+
"""
|
482 |
+
page.goto(url)
|
483 |
+
|
484 |
+
|
485 |
+
# https://playwright.dev/python/docs/api/class-page#page-go-back
|
486 |
+
def go_back():
|
487 |
+
"""
|
488 |
+
Navigate to the previous page in history.
|
489 |
+
|
490 |
+
Examples:
|
491 |
+
go_back()
|
492 |
+
"""
|
493 |
+
page.go_back()
|
494 |
+
|
495 |
+
|
496 |
+
# https://playwright.dev/python/docs/api/class-page#page-go-forward
|
497 |
+
def go_forward():
|
498 |
+
"""
|
499 |
+
Navigate to the next page in history.
|
500 |
+
|
501 |
+
Examples:
|
502 |
+
go_forward()
|
503 |
+
"""
|
504 |
+
page.go_forward()
|
505 |
+
|
506 |
+
|
507 |
+
# https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page
|
508 |
+
def new_tab():
|
509 |
+
"""
|
510 |
+
Open a new tab. It will become the active one.
|
511 |
+
|
512 |
+
Examples:
|
513 |
+
new_tab()
|
514 |
+
"""
|
515 |
+
global page
|
516 |
+
# set the new page as the active page
|
517 |
+
page = page.context.new_page()
|
518 |
+
# trigger the callback that sets this page as active in browsergym
|
519 |
+
page.evaluate(
|
520 |
+
"""\
|
521 |
+
const event = new Event('pageshow', {
|
522 |
+
bubbles: true, // Whether the event bubbles up through the DOM or not
|
523 |
+
cancelable: false // Whether the event can be canceled
|
524 |
+
});
|
525 |
+
window.dispatchEvent(event);
|
526 |
+
"""
|
527 |
+
)
|
528 |
+
|
529 |
+
|
530 |
+
# https://playwright.dev/python/docs/api/class-page#page-close
|
531 |
+
def tab_close():
|
532 |
+
"""
|
533 |
+
Close the current tab.
|
534 |
+
|
535 |
+
Examples:
|
536 |
+
tab_close()
|
537 |
+
"""
|
538 |
+
global page
|
539 |
+
context = page.context
|
540 |
+
page.close()
|
541 |
+
# set most recent page as active page, or open a new page if needed
|
542 |
+
if context.pages:
|
543 |
+
# TODO: do something more elaborate? (active page history)
|
544 |
+
page = context.pages[-1]
|
545 |
+
else:
|
546 |
+
page = context.new_page()
|
547 |
+
# trigger the callback that sets this page as active in browsergym
|
548 |
+
page.evaluate(
|
549 |
+
"""\
|
550 |
+
const event = new Event('pageshow', {
|
551 |
+
bubbles: true, // Whether the event bubbles up through the DOM or not
|
552 |
+
cancelable: false // Whether the event can be canceled
|
553 |
+
});
|
554 |
+
window.dispatchEvent(event);
|
555 |
+
"""
|
556 |
+
)
|
557 |
+
|
558 |
+
|
559 |
+
# https://playwright.dev/python/docs/api/class-page#page-bring-to-front
|
560 |
+
def tab_focus(index: int):
|
561 |
+
"""
|
562 |
+
Bring tab to front (activate tab).
|
563 |
+
|
564 |
+
Examples:
|
565 |
+
tab_focus(2)
|
566 |
+
"""
|
567 |
+
global page # set the focused page as the active page
|
568 |
+
page = page.context.pages[index]
|
569 |
+
page.bring_to_front()
|
570 |
+
# trigger the callback that sets this page as active in browsergym
|
571 |
+
page.evaluate(
|
572 |
+
"""\
|
573 |
+
const event = new Event('pageshow', {
|
574 |
+
bubbles: true, // Whether the event bubbles up through the DOM or not
|
575 |
+
cancelable: false // Whether the event can be canceled
|
576 |
+
});
|
577 |
+
window.dispatchEvent(event);
|
578 |
+
"""
|
579 |
+
)
|
580 |
+
|
581 |
+
|
582 |
+
# https://playwright.dev/python/docs/input#upload-files
|
583 |
+
def upload_file(bid: str, file: str | list[str]):
|
584 |
+
"""
|
585 |
+
Click an element and wait for a "filechooser" event, then select one
|
586 |
+
or multiple input files for upload. Relative file paths are resolved
|
587 |
+
relative to the current working directory. An empty list clears the
|
588 |
+
selected files.
|
589 |
+
|
590 |
+
Examples:
|
591 |
+
upload_file("572", "my_receipt.pdf")
|
592 |
+
upload_file("63", ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
|
593 |
+
"""
|
594 |
+
elem = get_elem_by_bid(page, bid, demo_mode != "off")
|
595 |
+
add_demo_mode_effects(page, elem, bid, demo_mode=demo_mode, move_cursor=True)
|
596 |
+
|
597 |
+
with page.expect_file_chooser() as fc_info:
|
598 |
+
elem.click(timeout=500)
|
599 |
+
|
600 |
+
file_chooser = fc_info.value
|
601 |
+
file_chooser.set_files(file)
|
602 |
+
|
603 |
+
|
604 |
+
# https://playwright.dev/python/docs/input#upload-files
|
605 |
+
def mouse_upload_file(x: float, y: float, file: str | list[str]):
|
606 |
+
"""
|
607 |
+
Click a location and wait for a "filechooser" event, then select one
|
608 |
+
or multiple input files for upload. Relative file paths are resolved
|
609 |
+
relative to the current working directory. An empty list clears the
|
610 |
+
selected files.
|
611 |
+
|
612 |
+
Examples:
|
613 |
+
mouse_upload_file(132.1, 547, "my_receipt.pdf")
|
614 |
+
mouse_upload_file(328, 812, ["/home/bob/Documents/image.jpg", "/home/bob/Documents/file.zip"])
|
615 |
+
"""
|
616 |
+
if demo_mode != "off":
|
617 |
+
smooth_move_visual_cursor_to(page, x, y)
|
618 |
+
highlight_by_box(page, {"x": x, "y": y, "width": 1, "height": 1})
|
619 |
+
|
620 |
+
with page.expect_file_chooser() as fc_info:
|
621 |
+
page.mouse.click(x, y)
|
622 |
+
|
623 |
+
file_chooser = fc_info.value
|
624 |
+
file_chooser.set_files(file)
|
BrowserGym/browsergym/core/src/browsergym/core/action/highlevel.py
ADDED
@@ -0,0 +1,522 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import inspect
|
2 |
+
import random
|
3 |
+
import typing
|
4 |
+
from dataclasses import dataclass
|
5 |
+
|
6 |
+
from . import utils
|
7 |
+
from .base import AbstractActionSet
|
8 |
+
from .functions import ( # check,; uncheck,
|
9 |
+
clear,
|
10 |
+
click,
|
11 |
+
dblclick,
|
12 |
+
drag_and_drop,
|
13 |
+
fill,
|
14 |
+
focus,
|
15 |
+
go_back,
|
16 |
+
go_forward,
|
17 |
+
goto,
|
18 |
+
hover,
|
19 |
+
keyboard_down,
|
20 |
+
keyboard_insert_text,
|
21 |
+
keyboard_press,
|
22 |
+
keyboard_type,
|
23 |
+
keyboard_up,
|
24 |
+
mouse_click,
|
25 |
+
mouse_dblclick,
|
26 |
+
mouse_down,
|
27 |
+
mouse_drag_and_drop,
|
28 |
+
mouse_move,
|
29 |
+
mouse_up,
|
30 |
+
mouse_upload_file,
|
31 |
+
new_tab,
|
32 |
+
noop,
|
33 |
+
press,
|
34 |
+
report_infeasible,
|
35 |
+
scroll,
|
36 |
+
select_option,
|
37 |
+
send_msg_to_user,
|
38 |
+
tab_close,
|
39 |
+
tab_focus,
|
40 |
+
upload_file,
|
41 |
+
)
|
42 |
+
from .parsers import action_docstring_parser, highlevel_action_parser
|
43 |
+
|
44 |
+
# Named groups of action functions. Each key is a subset name accepted by
# HighLevelActionSet; each value lists the action functions of that subset, in
# the order in which they are registered.
ACTION_SUBSETS = {
    # --- generic building blocks ---
    "chat": [send_msg_to_user],
    "infeas": [report_infeasible],
    # actions that target an element through its bid
    "bid": [
        scroll,
        fill,
        # check / uncheck are deliberately left out: they are not really needed and
        # might pollute the action space, doing more harm than good
        # check,
        # uncheck,
        select_option,
        click,
        dblclick,
        hover,
        press,
        focus,
        clear,
        drag_and_drop,
        upload_file,
    ],
    # actions that work with mouse coordinates and raw keyboard input
    "coord": [
        scroll,
        mouse_move,
        mouse_up,
        mouse_down,
        mouse_click,
        mouse_dblclick,
        mouse_drag_and_drop,
        mouse_upload_file,
        keyboard_down,
        keyboard_up,
        keyboard_press,
        keyboard_type,
        keyboard_insert_text,
    ],
    "nav": [
        go_back,
        go_forward,
        goto,
    ],
    "tab": [tab_close, tab_focus, new_tab],
    # --- benchmark-specific subsets ---
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L122
    "miniwob_all": [
        mouse_move,  # MOVE_COORDS
        mouse_click,  # CLICK_COORDS
        mouse_dblclick,  # DBLCLICK_COORDS
        mouse_down,  # MOUSEDOWN_COORDS
        mouse_up,  # MOUSEUP_COORDS
        scroll,  # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
        click,  # CLICK_ELEMENT
        keyboard_press,  # PRESS_KEY
        keyboard_type,  # TYPE_TEXT (and substitute for TYPE_FIELD)
        fill,  # FOCUS_ELEMENT_AND_TYPE_TEXT (and substitute for FOCUS_ELEMENT_AND_TYPE_FIELD)
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L142
    "miniwob_shi17": [
        mouse_click,  # CLICK_COORDS
        mouse_dblclick,  # DBLCLICK_COORDS
        mouse_down,  # MOUSEDOWN_COORDS
        mouse_up,  # MOUSEUP_COORDS
        scroll,  # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
        keyboard_press,  # PRESS_KEY
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L160
    "miniwob_liu18": [
        click,  # CLICK_ELEMENT
        fill,  # substitute for FOCUS_ELEMENT_AND_TYPE_FIELD
    ],
    # adapted from MiniWoB repo
    # https://github.com/Farama-Foundation/miniwob-plusplus/blob/1bab0dffe34e92cc1049fe9443542029bf7e44a9/miniwob/action.py#L173
    "miniwob_humphreys22": [
        mouse_move,  # MOVE_COORDS
        mouse_click,  # CLICK_COORDS
        mouse_dblclick,  # DBLCLICK_COORDS
        mouse_down,  # MOUSEDOWN_COORDS
        mouse_up,  # MOUSEUP_COORDS
        scroll,  # SCROLL_UP_COORDS, SCROLL_DOWN_COORDS
        keyboard_press,  # PRESS_KEY
        keyboard_type,  # substitute for TYPE_FIELD
    ],
    # sources for the webarena subset:
    #   paper:          https://arxiv.org/abs/2307.13854
    #   source code:    https://github.com/web-arena-x/webarena/blob/e31c190c9b43f63e5724322b847e00249300df40/browser_env/actions.py#L240
    #   default prompt: https://github.com/web-arena-x/webarena/blob/e31c190c9b43f63e5724322b847e00249300df40/agent/prompts/raw/p_cot_id_actree_2s.py#L13
    "webarena": [
        #                   # code          | paper              | prompt
        scroll,  #            SCROLL        | scroll(dir)        | scroll [down|up]
        keyboard_press,  #    KEY_PRESS     | press(key_comb)    | press [key_comb]
        #                     MOUSE_CLICK   |                    |
        #                     KEYBOARD_TYPE |                    |
        #                     MOUSE_HOVER   |                    |
        click,  #             CLICK         | click(elem)        | click [id]
        fill,  #              TYPE          | type(elem, text)   | type [id] [content]
        hover,  #             HOVER         | hover(elem)        | hover [id]
        tab_focus,  #         PAGE_FOCUS    | tab_focus(index)   | tab_focus [tab_index]
        new_tab,  #           NEW_TAB       | new_tab()          | new_tab
        go_back,  #           GO_BACK       | go_back()          | go_back
        go_forward,  #        GO_FORWARD    | go_forward()       | go_forward
        goto,  #              GOTO_URL      | goto(url)          | goto [url]
        tab_close,  #         PAGE_CLOSE    | tab_close()        | close_tab
        #                     CHECK         |                    |
        select_option,  #     SELECT_OPTION |                    |
        send_msg_to_user,  #  STOP          | stop(answer)       | stop [answer]
        report_infeasible,  # explicit unachievable action, equivalent STOP "N/A"
    ],
    # sources for the visualwebarena subset:
    #   paper:          https://arxiv.org/abs/2401.13649
    #   source code:    https://github.com/web-arena-x/visualwebarena/blob/15890922c97a8694e366fde2d7de8dbd1ff63fb5/browser_env/actions.py#L311-L343
    #   default prompt: https://github.com/web-arena-x/visualwebarena/blob/15890922c97a8694e366fde2d7de8dbd1ff63fb5/agent/prompts/jsons/p_cot_id_actree_3s.json#L2
    "visualwebarena": [
        #                   # code          | paper              | prompt
        scroll,  #            SCROLL        | scroll(dir)        | scroll [down|up]
        keyboard_press,  #    KEY_PRESS     | press(key_comb)    | press [key_comb]
        #                     MOUSE_CLICK   |                    |
        #                     KEYBOARD_TYPE |                    |
        #                     MOUSE_HOVER   |                    |
        click,  #             CLICK         | click(elem)        | click [id]
        fill,  #              TYPE          | type(elem, text)   | type [id] [content]
        hover,  #             HOVER         | hover(elem)        | hover [id]
        tab_focus,  #         PAGE_FOCUS    | tab_focus(index)   | tab_focus [tab_index]
        new_tab,  #           NEW_TAB       | new_tab()          | new_tab
        go_back,  #           GO_BACK       | go_back()          | go_back
        go_forward,  #        GO_FORWARD    | go_forward()       | go_forward
        goto,  #              GOTO_URL      | goto(url)          | goto [url]
        tab_close,  #         PAGE_CLOSE    | tab_close()        | close_tab
        #                     CHECK         |                    |
        select_option,  #     SELECT_OPTION |                    |
        send_msg_to_user,  #  STOP          | stop(answer)       | stop [answer]
        #                     CLEAR         |                    |
        upload_file,  #       UPLOAD        |                    |
        report_infeasible,  # explicit unachievable action, equivalent STOP "N/A"
    ],
    # from workarena paper
    # https://arxiv.org/abs/2403.07718
    "workarena": [
        scroll,
        fill,
        select_option,
        click,
        dblclick,
        hover,
        press,
        focus,
        clear,
        drag_and_drop,
        send_msg_to_user,
    ],
    # from workarena++ paper
    # https://arxiv.org/abs/2407.05291
    "workarena++": [
        scroll,
        fill,
        select_option,
        click,
        dblclick,
        hover,
        press,
        focus,
        clear,
        drag_and_drop,
        tab_focus,
        new_tab,
        tab_close,
        go_back,
        go_forward,
        goto,
        send_msg_to_user,
        report_infeasible,
    ],
    # from weblinx_browsergym
    # https://github.com/McGill-NLP/agentlab-weblinx-mvp/blob/a91b6d19870c5187d252e70a2e2013511cc6f1d2/weblinx_browsergym/__init__.py#L274-L286
    # (weblinx action -> browsergym action; ❌ marks actions without an equivalent)
    "weblinx": [
        send_msg_to_user,  # say(speaker="assistant", utterance=[str]) -> send_msg_to_user(text=[str])
        click,  # click(uid=[element id]) -> click(bid=[element id])
        hover,  # hover(uid=[element id]) -> hover(bid=[element id])
        fill,  # textinput(uid=[element id], value=[str]) -> fill(bid=[element id], value=[str])
        # change(uid=[element], value=[str]) -> ❌
        goto,  # load(url=[link]) -> goto(url=[link])
        # submit(uid=[element]) -> click(bid=[element id])
        scroll,  # scroll(x=[int x],y=[int y]) -> scroll(delta_x=[int x], delta_y=[int y])
        # copy(uid=[element],text=[str]) -> ❌
        # paste(uid=[element],text=[str]) -> ❌
        new_tab,  # tabcreate() -> new_tab()
        tab_close,  # tabremove(target=[tabId]) -> tab_close()
        tab_focus,  # tabswitch(origin=[origin tabId],target=[target tabId]) -> tab_focus(index=[target tabid])
    ],
    # from assistantbench paper
    # https://arxiv.org/abs/2407.15711
    "assistantbench": [
        scroll,  # SCROLL
        fill,  # TYPE
        select_option,  # SELECT
        click,  # CLICK
        press,  # PRESS ENTER
        go_back,  # GOBACK
        goto,  # GOTO, SEARCH
        send_msg_to_user,  # TERMINATE
    ],
}
|
249 |
+
|
250 |
+
|
251 |
+
@dataclass
class HighLevelAction:
    """Textual metadata describing one action of a high-level action set."""

    # entrypoint: callable

    # function name followed by its parameter list, e.g. as produced by
    # f"{func.__name__}{inspect.signature(func)}"
    signature: str
    # human-readable explanation of what the action does
    description: str
    # example calls, each one written as a line of Python source
    examples: list[str]
|
257 |
+
|
258 |
+
|
259 |
+
class HighLevelActionSet(AbstractActionSet):
    """
    Action space in which an action is one or several Python-like function calls
    (for instance `click("a12")`), restricted to a configurable set of functions.

    The allowed functions are gathered at construction time. Their source code,
    together with a few imports, flags and the functions of the `utils` module, is
    accumulated in `python_includes`; `to_python_code()` prepends that text to the
    validated calls.
    """

    # static class variables
    ActionSubset = typing.Literal[
        "chat",
        "infeas",
        "bid",
        "coord",
        "nav",
        "tab",
        "miniwob_all",
        "miniwob_shi17",
        "miniwob_liu18",
        "miniwob_humphreys22",
        "webarena",
        "visualwebarena",
        "workarena",
        "workarena++",
        "weblinx",
        "assistantbench",
        "custom",
    ]
    DemoMode = typing.Literal["off", "default", "all_blue", "only_visible_elements"]

    def __init__(
        self,
        subsets: typing.Optional[ActionSubset | list[ActionSubset]] = [
            "chat",
            "infeas",
            "bid",
            "nav",
            "tab",
        ],
        custom_actions: typing.Optional[list[callable]] = None,
        multiaction: bool = True,
        demo_mode: typing.Optional[DemoMode] = None,
        strict: bool = False,
        retry_with_force: bool = False,
    ):
        """
        Args:
            subsets: name, or list of names, of the action subsets to enable. Each name
                must be a key of ACTION_SUBSETS or the special name "custom". The default
                list is only read, never mutated.
            custom_actions: functions to enable when "custom" is among the subsets.
            multiaction: whether a single action string may contain several calls.
            demo_mode: value written as `demo_mode` in the generated code prelude.
            strict: forwarded to the parent constructor; when `self.strict` is true,
                `to_python_code()` requires the whole action string to parse
                (presumably the parent stores it as `self.strict` - confirm in base.py).
            retry_with_force: value written as `retry_with_force` in the generated code
                prelude.

        Raises:
            ValueError: if `subsets` is empty, contains an unknown name, contains
                "custom" while `custom_actions` is empty, or if two enabled functions
                share the same name.
        """
        super().__init__(strict)
        self.multiaction = multiaction
        self.demo_mode = demo_mode
        self.retry_with_force = retry_with_force

        if not subsets:
            raise ValueError("'action_subsets' is empty.")

        if isinstance(subsets, str):
            subsets = [subsets]

        allowed_actions = [noop]  # the noop action is always allowed

        # add actions from specified action sets (subsets is known to be non-empty here)
        for subset in subsets:
            if subset in ACTION_SUBSETS:
                allowed_actions.extend(ACTION_SUBSETS[subset])
            elif subset == "custom":
                if not custom_actions:
                    raise ValueError(
                        "'custom' is in 'action_subsets' but 'custom_actions' is empty."
                    )
                allowed_actions.extend(custom_actions)
            else:
                raise ValueError(f"Unknown high-level action subspace: {subset}")

        # like set() but preserves order
        # https://stackoverflow.com/questions/1653970/does-python-have-an-ordered-set
        allowed_actions = list(dict.fromkeys(allowed_actions))

        # parse the actions and build the action space
        self.action_set: dict[str, HighLevelAction] = {}
        self.python_includes = ""

        # include playwright imports
        self.python_includes += """\
import playwright.sync_api
from typing import Literal


"""
        # set demo_mode and retry_with_force flags
        # (DEMO_MODE is not defined here; it has to exist where the generated code runs)
        self.python_includes += f"""\
demo_mode={repr(demo_mode)}
retry_with_force={repr(retry_with_force)}

if demo_mode is None:
    demo_mode = "default" if DEMO_MODE else "off"

"""

        # include utility functions
        for _, func in inspect.getmembers(utils, inspect.isfunction):
            self.python_includes += f"""\
{inspect.getsource(func)}


"""

        # parse and include action functions
        for func in allowed_actions:

            # include action function definition in the code
            self.python_includes += f"""\
{inspect.getsource(func)}


"""

            # extract action signature
            signature = f"{func.__name__}{inspect.signature(func)}"

            # parse docstring: words of the description, then (name, args) example calls
            description, examples = action_docstring_parser.parse_string(func.__doc__)

            # reconstruct action description
            description = " ".join(description)

            # reconstruct action examples
            examples = [
                function_name + "(" + ", ".join([repr(arg) for arg in function_args]) + ")"
                for function_name, function_args in examples
            ]

            # two distinct functions with the same name would be indistinguishable in an action
            if func.__name__ in self.action_set:
                raise ValueError(f"Duplicated action '{func.__name__}'")

            self.action_set[func.__name__] = HighLevelAction(
                # entrypoint=func,
                signature=signature,
                description=description,
                examples=examples,
            )

    def example_action(self, abstract: bool, max_examples: int = 3) -> str:
        """
        Returns an example action as a string.

        Args:
            abstract: if True, return a generic sentence describing what an action
                looks like; otherwise return concrete example calls.
            max_examples: maximum number of calls in a concrete multi-action example.

        Returns:
            The example text. In concrete mode this is the empty string when no action
            of the set provides an example.
        """
        if abstract:
            if self.multiaction:
                return """\
One or several actions, separated by new lines."""
            else:
                return """\
One single action to be executed. You can only use one action at a time."""

        picked_examples = []

        # use fill and click examples if action is present
        for action_name in ["fill", "click", "mouse_click", "keyboard_type"]:
            if action_name in self.action_set:
                picked_examples.extend(self.action_set[action_name].examples)

        # last resort, use all action examples
        if not picked_examples:
            for action in self.action_set.values():
                picked_examples += action.examples

        # nothing to show: return an empty string (which describe() handles) rather
        # than failing on picked_examples[0] in single-action mode
        if not picked_examples:
            return ""

        # shuffle examples, with a fixed seed so that the result is reproducible
        rng = random.Random(1)
        rng.shuffle(picked_examples)

        if self.multiaction:
            return "\n".join(picked_examples[:max_examples])
        return picked_examples[0]

    def describe(self, with_long_description: bool = True, with_examples: bool = True):
        """
        Returns a textual description of this action space.

        Args:
            with_long_description: include the description of each action.
            with_examples: include the example calls of each action.
        """
        description = f"""
{len(self.action_set)} different types of actions are available.

"""
        for action in self.action_set.values():
            description += f"""\
{action.signature}
"""

            if with_long_description:
                description += f"""\
    Description: {action.description}
"""
            if with_examples and action.examples:
                description += """\
    Examples:
"""
                for example in action.examples:
                    description += f"""\
        {example}

"""

        if self.multiaction:
            description += """\
Multiple actions can be provided at once, but will be executed sequentially without any feedback from the page.
More than 2-3 actions usually leads to failure or unexpected behavior."""
        else:
            description += """\
Only a single action can be provided at once."""

        example_action = self.example_action(abstract=False)
        if example_action:
            description += f""" Example:
{example_action}
"""
        else:
            description += """\

"""

        return description

    def to_python_code(self, action):
        """
        Converts the given high-level action string to browsergym-compatible python code.

        Args:
            action: the high-level action to parse.

        Returns:
            Executable python code that performs the action in a browsergym environment.

        Raises:
            ValueError: if no function call is found, or if several calls are found
                while multi-actions are disabled.
            NameError: if a called function is not part of this action set.
            In strict mode, text that does not parse entirely makes the parser raise.
        """
        # do the actual parsing and convert each high-level action to
        # the corresponding python function call
        if self.strict:
            # the whole string must consist of function calls
            function_calls = highlevel_action_parser.parse_string(
                action, parse_all=True
            ).as_list()
        else:
            # allow for multiple matches, skip anything in-between
            matches = highlevel_action_parser.search_string(action).as_list()
            # unpack multiple matches into one flat list of calls
            function_calls = [call for match in matches for call in match]

        if not function_calls:
            raise ValueError("Received an empty action.")
        elif len(function_calls) > 1 and not self.multiaction:
            raise ValueError("Received a multi-action, only single-actions are allowed.")

        # function calls, one per line
        call_lines = []
        for function_name, function_args in function_calls:
            if function_name not in self.action_set:
                raise NameError(f"Invalid action type '{function_name}'.")
            call_lines.append(
                function_name + "(" + ", ".join([repr(arg) for arg in function_args]) + ")\n"
            )

        # return the constructed python code: function definitions, then function calls
        return self.python_includes + "".join(call_lines)
|
516 |
+
|
517 |
+
|
518 |
+
# consistency checks: "custom" is a reserved subset name, and the ActionSubset
# literal must list exactly the predefined subsets plus "custom"
assert "custom" not in ACTION_SUBSETS
assert set(typing.get_args(HighLevelActionSet.ActionSubset)) == set(ACTION_SUBSETS) | {"custom"}
|
BrowserGym/browsergym/core/src/browsergym/core/action/parsers.py
ADDED
@@ -0,0 +1,92 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import ast
|
2 |
+
import pyparsing as pp
|
3 |
+
|
4 |
+
from dataclasses import dataclass
|
5 |
+
from typing import Any
|
6 |
+
|
7 |
+
|
8 |
+
@dataclass
class NamedArgument:
    """A keyword argument (`name=value`) of a parsed function call."""

    name: str
    value: Any

    def __repr__(self):
        # rendered the way the argument is written in Python source, e.g. bid='a12'
        return f"{self.name}={self.value!r}"
|
15 |
+
|
16 |
+
|
17 |
+
def _build_highlevel_action_parser() -> pp.ParserElement:
    """
    Builds the grammar used to read high-level actions.

    Returns:
        A parser that accepts sequences of Python-like function calls whose arguments
        are literals: strings, numbers, True / False / None, lists, tuples and dicts
        (with string keys), optionally followed by named arguments (name=literal).
            Example:
                func("a", 42, None, True, [2, 4, "s"], {"a_key": "a_value"}, )
        Calls may share a line or be spread over several lines, with any indentation.
            Example:
                func() func()
                    func()
        Python comments are ignored.
            Example:
                # this is a comment
                func() # this function call will be parsed
                # func() # this one will not
        Each parsed call yields a (function_name, function_args) pair, one per call
        found in the input. Input that does not match makes pyparsing raise.
    """

    def constant(word, value):
        # keyword that is replaced by the given Python value in the parse results
        return pp.Keyword(word).set_parse_action(pp.replace_with(value))

    def unquote(toks):
        # turn the quoted source text into the actual Python string
        return ast.literal_eval(toks[0])

    true_kw = constant("True", True)
    false_kw = constant("False", False)
    none_kw = constant("None", None)

    # punctuation is matched but dropped from the results
    lbrack, rbrack, lbrace, rbrace, lparen, rparen, colon = map(pp.Suppress, "[]{}():")

    string = pp.python_quoted_string().set_parse_action(unquote)
    number = pp.pyparsing_common.number()

    # containers are recursive (they hold elements), hence forward declarations
    dict_expr = pp.Forward().set_name("dict")
    list_expr = pp.Forward().set_name("list")
    tuple_expr = pp.Forward().set_name("tuple")
    element = (
        string | number | dict_expr | list_expr | tuple_expr | true_kw | false_kw | none_kw
    ).set_name("element")

    # comma-separated elements, shared by lists and tuples
    sequence_items = pp.DelimitedList(element, allow_trailing_delim=True).set_name(None)
    list_expr << pp.Group(lbrack + pp.Optional(sequence_items) + rbrack, aslist=True)
    tuple_expr << pp.Group(
        lparen + pp.Optional(sequence_items) + rparen, aslist=True
    ).set_parse_action(lambda tokens: tuple(tokens[0]))

    dict_entry = pp.Group(string + colon + element, aslist=True).set_name("dict item")
    dict_entries = pp.DelimitedList(dict_entry, allow_trailing_delim=True).set_name(None)
    dict_expr << pp.Dict(lbrace + pp.Optional(dict_entries) + rbrace, asdict=True)

    # call arguments: positional literals first, then name=literal pairs
    positional_args = pp.DelimitedList(element, allow_trailing_delim=True).set_name(None)
    keyword_arg = (
        pp.pyparsing_common.identifier() + pp.Literal("=") + element
    ).set_parse_action(lambda tokens: NamedArgument(name=tokens[0], value=tokens[2]))
    keyword_args = pp.DelimitedList(keyword_arg, allow_trailing_delim=True).set_name(None)

    call = pp.pyparsing_common.identifier() + pp.Group(
        lparen + pp.Optional(positional_args) + pp.Optional(keyword_args) + rparen,
        aslist=True,
    )

    # an empty delimiter lets consecutive calls follow each other directly
    calls = pp.DelimitedList(pp.Group(call), delim="")
    calls.ignore(pp.python_style_comment())

    return calls
|
82 |
+
|
83 |
+
|
84 |
+
# parser applied to action strings, to extract python-like function calls
highlevel_action_parser: pp.ParserElement = _build_highlevel_action_parser()

# parser applied to the docstring of high-level actions, in order to describe the action space:
# the words before the "Examples:" marker form the description, the calls after it the examples
_docstring_description = pp.Group(
    pp.OneOrMore(pp.Word(pp.printables), stop_on=pp.Literal("Examples:"))
)
_docstring_examples = pp.Group(highlevel_action_parser)
action_docstring_parser: pp.ParserElement = (
    _docstring_description + pp.Literal("Examples:").suppress() + _docstring_examples
)
|
BrowserGym/browsergym/core/src/browsergym/core/action/python.py
ADDED
@@ -0,0 +1,112 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
|
3 |
+
from .base import AbstractActionSet
|
4 |
+
|
5 |
+
|
6 |
+
class PythonActionSet(AbstractActionSet):
    """
    Action space in which an action is a block of Python code that drives the
    browser through Playwright.
    """

    # markdown-style code fence, with an optional "python" language tag
    _CODE_FENCE = re.compile(r"```(?:python)?\n(?P<code>[\s\S]*?)```")

    def describe(self, with_long_description: bool = True, with_examples: bool = True):
        """
        Returns a textual description of this action space.

        Args:
            with_long_description: include the explanations about the browser context,
                iframes and the `bid` attribute.
            with_examples: include a list of example actions.
        """
        description = """
Each action consists of executable Python code (python>=3.10) that uses the Playwright library (playwright==1.32)
to interact with the current webpage and the browser context. The currently active webpage is accessible via the
global variable `page`. A function `send_message_to_user(text)` is also accessible and can be used to send a
message to the user, as well as a function `report_infeasible_instructions(reason)` to notify the user when their
instructions are infeasible."""
        if with_long_description:
            description += """
The browser context is in `page.context`, and all open webpages (tabs and popups)
are in `page.context.pages`. Here is is an example of a valid action:
```
frame = page.frame_locator(".result-frame")
button = frame.get_by_text("Submit")
button.click()
```
Here is another example:
```
frame = page.get_by_test_id("a").frame_locator(":scope")
frame.get_by_test_id("a776").click()
```
Note that Playwright's `get_by_test_id()` method is configured to use the `bid` attribute to locate HTML elements,
instead of the default `data-testid`. Also, Playwright's locators can not traverse iframes, so you have to locate
parent iframes first in order to locate an element in an iframe. The `bid` attribute contains all the information
required to recursively locate an element. For example, an element with `bid="ac2"` can be retrieved as follows:
```
frame = page.get_by_test_id("a").frame_locator(":scope")
frame = frame.get_by_test_id("ac").frame_locator(":scope")
elem = frame.get_by_test_id("ac2")
```
"""
        else:
            # just terminate the first paragraph
            description += """\

"""
        if with_examples:
            description += """\
Here are other examples of valid actions:
```
page = page.context.new_page()
page.goto("https://www.wikipedia.org/")
```
```
page.get_by_label("Birth date").fill("2020-02-02")
page.get_by_role("link", name="Get started").click()
```
```
page.get_by_label('I agree to the terms above').check()
```
```
page.locator('#area').fill('Hello World!')
```
```
page.get_by_role("textbox").press("Control+ArrowRight")
```
```
send_message_to_user("There are 7 items to choose from.")
```
```
report_infeasible_instructions("I cannot follow these instructions because there is no email field in this form.")
```
"""

        return description

    def example_action(self, abstract: bool) -> str:
        """
        Returns an example action as a string.

        Args:
            abstract: if True, return a sentence describing the expected format;
                otherwise return a concrete snippet of code.
        """
        if abstract:
            return """\
One single bloc of Python code. Do not include any explanation, only valid Python code."""
        else:
            # the nested frame "ba" is located from its parent frame "b", not from the
            # page, consistently with the iframe rule stated in describe()
            return """\
frame = page.get_by_test_id("b").frame_locator(":scope")
frame = frame.get_by_test_id("ba").frame_locator(":scope")
frame.get_by_test_id("ba2").fill("Hello world!")
frame.get_by_test_id("ba3").click()
"""

    def to_python_code(self, action):
        """
        Converts the given code action string to browsergym-compatible playwright code.

        Args:
            action: the code action to parse.

        Returns:
            Executable playwright code that performs the action in a browsergym environment.
            When the action contains markdown code fences, the contents of all fences
            joined by new lines; otherwise the action string unchanged.
        """
        # extract markdown-style code snippets if detected; search() rather than
        # match(), so that a fence preceded by text or whitespace is detected too
        if self._CODE_FENCE.search(action):
            return "\n".join(
                fence.group("code") for fence in self._CODE_FENCE.finditer(action)
            )

        # otherwise just use the code action as is
        return "" + action
|
BrowserGym/browsergym/core/src/browsergym/core/action/utils.py
ADDED
@@ -0,0 +1,288 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from typing import Literal
|
2 |
+
|
3 |
+
import playwright.sync_api
|
4 |
+
|
5 |
+
|
6 |
+
def get_elem_by_bid(
    page: playwright.sync_api.Page, bid: str, scroll_into_view: bool = False
) -> playwright.sync_api.Locator:
    """
    Locates the element with the given bid, going through every nested frame that
    leads to it. The bid itself encodes the chain of frames: each frame id is its
    parent frame id followed by one character and any directly following uppercase
    letters, and the trailing digits identify the element. For instance, the element
    "abDAb123" is located inside frame abDAb, which is located inside frame abDA,
    which is located inside frame a, which is located inside the page's main frame.

    Args:
        page: the playwright page in which to look for the element.
        bid: the browsergym id (playwright testid) of the page element.
        scroll_into_view: try to scroll the traversed frames and the element into view
            (each with a 500 ms timeout).

    Returns:
        Playwright locator of the element.

    Raises:
        ValueError: if bid is not a string, or if a frame on the way or the element
            itself can not be found.
    """
    if not isinstance(bid, str):
        raise ValueError(f"expected a string, got {repr(bid)}")

    def locate(scope, test_id):
        # shared lookup for the intermediate frames and for the final element
        found = scope.get_by_test_id(test_id)
        if not found.count():
            raise ValueError(f'Could not find element with bid "{bid}"')
        if scroll_into_view:
            found.scroll_into_view_if_needed(timeout=500)
        return found

    scope = page
    end = 0  # bid[:end] is the id of the innermost frame entered so far

    # dive into each nested frame, to the frame where the element is located;
    # what is left once the remainder is purely numeric is the element's own suffix
    while end < len(bid) and not bid[end:].isnumeric():
        end += 1
        # allow multi-character frame ids such as aA, bCD etc.
        while end < len(bid) and bid[end].isalpha() and bid[end].isupper():
            end += 1
        scope = locate(scope, bid[:end]).frame_locator(":scope")

    # finally, we should have selected the frame where the target element is
    return locate(scope, bid)
|
50 |
+
|
51 |
+
|
52 |
+
def highlight_by_box(
    page: playwright.sync_api.Page, box: dict, color: Literal["blue", "red"] = "blue"
):
    """
    Highlights the target element based on its bounding box attributes.

    A temporary overlay is placed over the box and animated with an expanding,
    fading shadow; the call then waits for one second so that the highlight can
    be seen. Nothing is drawn when `box` is empty or None.

    Args:
        page: the playwright page in which to draw the highlight.
        box: bounding box with the keys "x", "y", "width" and "height", in pixels.
        color: color of the highlight, either "blue" or "red".
    """

    assert color in ("blue", "red")

    if box:
        left, top, width, height = box["x"], box["y"], box["width"], box["height"]
        # The text of a style attribute is CSS, so properties must use their CSS
        # (kebab-case) names: border-radius, box-shadow, pointer-events. The camelCase
        # spelling is only valid on the `element.style` JavaScript object (as used in
        # the animation below); in CSS text such declarations are silently dropped.
        page.evaluate(
            f"""\
const overlay = document.createElement('div');
document.body.appendChild(overlay);
overlay.setAttribute('style', `
    all: initial;
    position: fixed;
    border: 2px solid transparent;  /* Start with transparent border */
    border-radius: 10px;  /* Add rounded corners */
    box-shadow: 0 0 0px {color};  /* Initial box shadow with 0px spread */
    left: {left - 2}px;  /* Adjust left position to accommodate initial shadow spread */
    top: {top - 2}px;  /* Adjust top position likewise */
    width: {width}px;
    height: {height}px;
    z-index: 2147483646; /* Maximum value - 1 */
    pointer-events: none; /* Ensure the overlay does not interfere with user interaction */
`);

// Animate the box shadow to create a "wave" effect
let spread = 0;  // Initial spread radius of the box shadow
const waveInterval = setInterval(() => {{
    spread += 10;  // Increase the spread radius to simulate the wave moving outward
    overlay.style.boxShadow = `0 0 40px ${{spread}}px {color}`;  // Update the box shadow to the new spread radius
    overlay.style.opacity = 1 - spread / 38;  // Gradually decrease opacity to fade out the wave
    if (spread >= 38) {{  // Assuming 76px ~ 2cm spread radius
        clearInterval(waveInterval);  // Stop the animation once the spread radius reaches 2cm
        document.body.removeChild(overlay);  // Remove the overlay from the document
    }}
}}, 200);  // Adjust the interval as needed to control the speed of the wave animation
"""
        )
        # Wait a bit to let users see the highlight
        page.wait_for_timeout(1000)  # Adjust delay as needed
|
94 |
+
|
95 |
+
|
96 |
+
def smooth_move_visual_cursor_to(
    page: playwright.sync_api.Page, x: float, y: float, speed: float = 400
):
    """
    Smoothly moves the visual cursor to a specific point, with constant
    movement speed.

    The cursor is an SVG arrow stored on ``window.browsergym_visual_cursor``. It
    is created at the center of the viewport on first use, attached to the
    document for the duration of the animation, and detached about one second
    after it stops moving if no other animation is still using it. The call
    blocks for the movement time computed by the page script.

    Args:
        page: the page in which the cursor is displayed.
        x: target location X coordinate (in viewport pixels)
        y: target location Y coordinate (in viewport pixels)
        speed: cursor speed (in pixels per second)
    """
    movement_time = page.evaluate(
        """\
([targetX, targetY, speed]) => {

    // create cursor if needed
    if (!("browsergym_visual_cursor" in window)) {
        if (window.trustedTypes && window.trustedTypes.createPolicy) {
            try {
                window.trustedTypes.createPolicy('default', {
                    createHTML: (string, sink) => string
                });
            } catch (error) {
                // createPolicy throws when a 'default' policy already exists
                // (or the name is not allowed); carry on with what is in place.
            }
        }
        const newCursor = document.createElement('div');
        newCursor.setAttribute('id', 'browsergym-visual-cursor');
        newCursor.innerHTML = `
            <svg width="50px" height="50px" viewBox="213 106 713 706" fill="none" xmlns="http://www.w3.org/2000/svg">
            <path d="M213.333 106.667L426.667 853.333 512 512 853.333 426.667 213.333 106.667z" fill="blue"/>
            </svg>
`;
        newCursor.setAttribute('style', `
            all: initial;
            position: fixed;
            opacity: 0.7; /* Slightly transparent */
            z-index: 2147483647; /* Maximum value */
            pointer-events: none; /* Ensures the SVG doesn't interfere with page interactions */
        `);

        // Start from the center of the viewport
        newCursor.style.left = `${window.innerWidth / 2}px`;
        newCursor.style.top = `${window.innerHeight / 2}px`;

        // save cursor element
        window.browsergym_visual_cursor = newCursor;
        window.browsergym_visual_cursor_n_owners = 0;
    }

    // recover cursor
    const cursor = window.browsergym_visual_cursor;

    // attach cursor to document
    document.body.appendChild(cursor);
    window.browsergym_visual_cursor_n_owners += 1;

    // All animation state is local: undeclared variables would leak onto
    // `window` and be shared (and corrupted) by overlapping animations.
    let x = parseFloat(cursor.style.left);
    let y = parseFloat(cursor.style.top);

    const dx = targetX - x;
    const dy = targetY - y;
    const dist = Math.hypot(dx, dy);
    const movement_time = (dist / speed) * 1000;  // seconds to milliseconds
    const still_wait_time = 1000;

    // Adjust steps based on distance to keep movement speed consistent
    // 1 step per 10 pixels of distance, adjust as needed
    const steps = Math.max(1, Math.trunc(dist / 10));

    const step_dx = dx / steps;
    const step_dy = dy / steps;
    const step_wait_time = Math.max(10, movement_time / steps);

    let step = 0;
    let time_still = 0;
    const cursorInterval = setInterval(() => {
        // move cursor
        if (step < steps) {
            x += step_dx;
            y += step_dy;
            cursor.style.left = `${x}px`;
            cursor.style.top = `${y}px`;
        }
        // still cursor (wait a bit)
        else if (time_still < still_wait_time) {
            time_still += step_wait_time;
        }
        // stop and detach cursor
        else {
            clearInterval(cursorInterval);
            window.browsergym_visual_cursor_n_owners -= 1;
            if (window.browsergym_visual_cursor_n_owners <= 0) {
                // remove() does not throw if the cursor was already detached
                cursor.remove();
            }
        }
        step += 1;
    }, step_wait_time);

    return movement_time;
}""",
        [x, y, speed],
    )
    page.wait_for_timeout(movement_time)
|
202 |
+
|
203 |
+
|
204 |
+
def check_for_overlay(
    page: playwright.sync_api.Page, bid: str, element: playwright.sync_api.ElementHandle, box: dict
):
    """Tell whether an element can be considered visible (not covered).

    The decision is taken as follows:

    1. a falsy ``element`` is never visible;
    2. if the element carries a ``browsergym_visibility_ratio`` attribute, the
       element is visible when that ratio is at least 0.5;
    3. otherwise the four corners of ``box`` are probed with
       ``document.elementFromPoint`` and the element is visible as soon as the
       HTML of the element found at one corner contains ``bid``.

    Args:
        page: the page holding the element.
        bid: the browsergym id of the element.
        element: handle of the element, may be falsy.
        box: mapping with "x", "y", "width" and "height" entries, may be falsy.

    Returns:
        True if the element is considered visible, False otherwise.
    """
    if not element:
        return False

    visibility = element.get_attribute("browsergym_visibility_ratio")
    if visibility is not None:
        return float(visibility) >= 0.5

    if not box:
        return False

    left, top = box["x"], box["y"]
    right, bottom = left + box["width"], top + box["height"]

    for x, y in ((left, top), (right, top), (left, bottom), (right, bottom)):
        # Find the topmost element at that point.
        top_element = page.evaluate(
            """([x, y]) => {
    const el = document.elementFromPoint(x, y);
    return el ? el.outerHTML : '';
}""",
            [x, y],
        )

        # NOTE(review): this is a plain substring test on the outer HTML, so an
        # ancestor of the target, or any markup that merely contains the bid
        # text, also counts as a match.
        if top_element and bid in top_element:
            return True

    return False
|
239 |
+
|
240 |
+
|
241 |
+
def add_demo_mode_effects(
    page: playwright.sync_api.Page,
    elem: playwright.sync_api.ElementHandle,
    bid: str,
    demo_mode: Literal["off", "default", "all_blue", "only_visible_elements"],
    move_cursor: bool = True,
    highlight_box: bool = True,
):
    """Adds visual effects to the target element.

    Depending on ``demo_mode``:

    - "off": nothing is done;
    - "default": blue effects if the element passes the overlay check, red otherwise;
    - "all_blue": blue effects in all cases;
    - "only_visible_elements": blue effects, only if the element passes the overlay check.

    Nothing is done either when the element has no bounding box.

    Args:
        page: the page holding the element.
        elem: the target element.
        bid: the browsergym id of the target element.
        demo_mode: which effects to apply, see above.
        move_cursor: whether to move the visual cursor to the element center.
        highlight_box: whether to highlight the element bounding box.

    Raises:
        ValueError: if ``demo_mode`` is not one of the supported values.
    """
    # Reject unknown modes up front rather than failing later on an unbound color.
    if demo_mode not in ("off", "default", "all_blue", "only_visible_elements"):
        raise ValueError(f"Invalid demo_mode: {demo_mode!r}")

    if demo_mode == "off":
        return

    box = elem.bounding_box()
    if not box:
        return

    center_x, center_y = box["x"] + box["width"] / 2, box["y"] + box["height"] / 2
    is_top_element = check_for_overlay(page, bid, elem, box)

    if demo_mode == "only_visible_elements" and not is_top_element:
        return

    # Red is only used to flag a covered element in "default" mode.
    if demo_mode == "default" and not is_top_element:
        color = "red"
    else:
        color = "blue"

    if move_cursor:
        smooth_move_visual_cursor_to(page, center_x, center_y)

    if highlight_box:
        highlight_by_box(page, box, color=color)
|
279 |
+
|
280 |
+
|
281 |
+
def call_fun(fun: callable, retry_with_force: bool):
    """Run ``fun(force=False)``, optionally retrying once with ``force=True``.

    Args:
        fun: callable accepting a ``force`` keyword argument.
        retry_with_force: when True, a Playwright timeout raised by the first
            attempt triggers a second attempt with ``force=True``; when False
            the timeout error is propagated.
    """
    try:
        fun(force=False)
    except playwright.sync_api.TimeoutError:
        if not retry_with_force:
            raise
        fun(force=True)
|
BrowserGym/browsergym/core/src/browsergym/core/chat.py
ADDED
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import base64
|
2 |
+
from pathlib import Path
|
3 |
+
from typing import Literal
|
4 |
+
import logging
|
5 |
+
import playwright.sync_api
|
6 |
+
import re
|
7 |
+
import time
|
8 |
+
|
9 |
+
from importlib import resources
|
10 |
+
|
11 |
+
from . import _get_global_playwright, chat_files
|
12 |
+
|
13 |
+
|
14 |
+
CHATBOX_DIR = resources.files(chat_files)
|
15 |
+
|
16 |
+
logger = logging.getLogger(__name__)
|
17 |
+
|
18 |
+
|
19 |
+
class Chat:
    """Chat window backed by its own Chromium browser, context and page.

    Attributes:
        messages: list of dicts with "role", "timestamp" and "message" entries,
            in the order in which the messages were recorded.
        browser: the browser launched for the chat.
        context: the browser context of the chat.
        page: the page displaying the chatbox.
        recording_start_time: time at which the page was created when video
            recording is enabled, None otherwise.
    """

    def __init__(
        self, headless: bool, chat_size=(500, 800), record_video_dir=None, modern=True
    ) -> None:
        """Launch the chat browser and load the chatbox page.

        Args:
            headless: whether to launch the browser in headless mode.
            chat_size: (width, height) of the chat window, also used as video size.
            record_video_dir: directory in which a "chat_video" sub-directory
                receives the video recording; no recording if falsy.
            modern: load the modern chatbox if True, the classic one otherwise.
        """
        self.messages = []

        # create a new browser, browser context and page for the chat
        pw: playwright.sync_api.Playwright = _get_global_playwright()
        self.browser = pw.chromium.launch(
            headless=headless, args=[f"--window-size={chat_size[0]},{chat_size[1]}"]
        )
        self.context = self.browser.new_context(
            no_viewport=True,
            record_video_dir=Path(record_video_dir) / "chat_video" if record_video_dir else None,
            record_video_size=dict(width=chat_size[0], height=chat_size[1]),
        )
        self.page = self.context.new_page()
        self.recording_start_time = time.time() if record_video_dir else None

        # setup the chat page
        self.page.expose_function(
            "send_user_message", lambda msg: self._js_user_message_received_callback(msg=msg)
        )

        if modern:
            self.page.set_content(get_chatbox_modern(CHATBOX_DIR))
        else:
            self.page.set_content(get_chatbox_classic(CHATBOX_DIR))

    def _js_user_message_received_callback(self, msg: str):
        """Callback function for when a user message is received in the chatbox"""
        utc_time = time.time()
        self.messages.append({"role": "user", "timestamp": utc_time, "message": msg})
        # returning a list as JS doesnt like tuples
        return ["user", time.strftime("%H:%M", time.localtime(utc_time)), msg]

    def add_message(
        self, role: Literal["user", "user_image", "assistant", "info", "infeasible"], msg: str
    ):
        """Add a message to the chatbox and update the page accordingly.

        Messages of every role except "info" are also recorded in ``self.messages``.

        Raises:
            ValueError: if ``role`` is not one of the supported roles.
        """
        utc_time = time.time()
        if role not in ("user", "user_image", "assistant", "info", "infeasible"):
            raise ValueError(f"Invalid role: {role}")
        if role in ("user", "user_image", "assistant", "infeasible"):
            self.messages.append({"role": role, "timestamp": utc_time, "message": msg})
        timestamp = time.strftime("%H:%M:%S", time.localtime(utc_time))
        # Hand the values over as an argument: Python's repr() is not a
        # JavaScript serializer, so formatting them into the script is fragile.
        self.page.evaluate(
            "([role, timestamp, msg]) => addChatMessage(role, timestamp, msg)",
            [role, timestamp, msg],
        )

    def wait_for_user_message(self):
        """Block until the chat page raises its USER_MESSAGE_RECEIVED flag."""
        logger.info("Waiting for message from user...")
        # reset flag
        self.page.evaluate("USER_MESSAGE_RECEIVED = false;")
        # wait for flag to be raised
        self.page.wait_for_function("USER_MESSAGE_RECEIVED", polling=100, timeout=0)
        logger.info("Message received.")

    def close(self):
        """Close the chat context, then the chat browser."""
        try:
            self.context.close()
        finally:
            # the browser must be closed even if closing the context failed
            self.browser.close()
|
78 |
+
|
79 |
+
|
80 |
+
def get_chatbox_modern(chatbox_dir) -> str:
    """Return the HTML source of the modern chatbox.

    Args:
        chatbox_dir: directory-like object supporting the ``/`` operator, which
            holds "chatbox_modern.html".
    """
    # The encoding is explicit so the result does not depend on the platform default.
    return (chatbox_dir / "chatbox_modern.html").read_text(encoding="utf-8")
|
85 |
+
|
86 |
+
|
87 |
+
def get_chatbox_classic(chatbox_dir) -> str:
    """Return the HTML source of the classic chatbox, with its assistant image inlined.

    Every "<ASSISTANT_IMAGE_URL>" placeholder of "chatbox.html" is replaced by
    a base64 ``data:`` URL built from "assistant.png".

    Args:
        chatbox_dir: directory-like object supporting the ``/`` operator, which
            holds "chatbox.html" and "assistant.png".
    """
    # The encoding is explicit so the result does not depend on the platform default.
    chatbox_html = (chatbox_dir / "chatbox.html").read_text(encoding="utf-8")
    image_bytes = (chatbox_dir / "assistant.png").read_bytes()
    image_base64 = base64.b64encode(image_bytes).decode("utf-8")

    assistant_image_url = f"data:image/png;base64,{image_base64}"
    # Literal replacement: neither the placeholder nor the URL is a pattern.
    return chatbox_html.replace("<ASSISTANT_IMAGE_URL>", assistant_image_url)
|
BrowserGym/browsergym/core/src/browsergym/core/chat_files/chatbox.html
ADDED
@@ -0,0 +1,243 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<!DOCTYPE html>
|
2 |
+
<html lang="en">
|
3 |
+
|
4 |
+
<head>
|
5 |
+
<meta charset="UTF-8">
|
6 |
+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
7 |
+
<title>UI Assistant Chat</title>
|
8 |
+
<style>
|
9 |
+
/* Full-window container, laid out as a column: header, body, debug, input */
.chat-container {
    display: flex;
    flex-flow: column;
    position: fixed;
    bottom: 0;
    right: 0;
    height: 100%;
    width: 100%;
    border: 1px solid black;
    background-color: white;
    padding: 0;
    overflow: hidden;
    box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    font-family: 'Source Sans Pro', Arial, Helvetica, sans-serif;
}

.chat-header {
    background-color: #032D42;
    color: white;
    padding: 5px;
    padding-left: 15px;
    text-align: center;
    flex: 0 1 auto;
}

/* Scrollable message list, takes the remaining height */
.chat-body {
    padding: 10px;
    overflow-y: auto;
    display: flex;
    flex-direction: column;
    flex: 1 1 auto;
}

/* Area receiving the "info" messages */
.chat-debug {
    padding: 10px;
    max-height: 30%;
    overflow-y: auto;
    display: flex;
    flex-direction: column;
    flex: 0 0 auto;
}

.chat-input-area {
    display: flex;
    flex-flow: row;
    margin-top: 5px;
    padding: 10px;
    border-top: 1px solid #ddd;
    flex: 0 1 50px;
}

.chat-input-area form {
    display: flex;
    width: 100%;
    height: 100%;
}

.input-box {
    padding: 5px;
    margin-right: 10px;
    border-radius: 5px;
    border: 1px solid #ccc;
    width: 100%;
}

.submit-button {
    padding: 5px 10px;
    border-radius: 5px;
    background-color: #4CAF50;
    color: white;
    border: none;
    align-self: center;
}

.message {
    display: flex;
    align-items: center;
    margin: 0px;
    padding: 0px;
}

.message p {
    padding: 10px;
    /* Added padding inside the bubble */
    border-radius: 15px;
    flex-grow: 1;
    /* A non-zero length needs a unit, a bare "10" is dropped by the parser.
       NOTE(review): pixels assumed, as everywhere else in this sheet. */
    margin-top: 10px;
    margin-bottom: 0;
}

.chat-debug .message p {
    padding: 0;
    border-radius: 0;
    flex-grow: 1;
    margin-top: 0;
    margin-bottom: 0;
}

.user-message {
    background-color: #d1f4d1;
}

.assistant-message {
    background-color: #e0e0e0;
}

.info-message {
    background-color: #f0f0f0;
    color: #707070;
    font-size: 13px;
}

.assistant-image {
    margin: 0px;
    padding: 10px;
    width: 40px;
}
|
127 |
+
</style>
|
128 |
+
</head>
|
129 |
+
|
130 |
+
<body>
|
131 |
+
|
132 |
+
|
133 |
+
|
134 |
+
<div class="chat-container">
|
135 |
+
<div class="chat-header">
|
136 |
+
<h2>BrowserGym</h2>
|
137 |
+
</div>
|
138 |
+
<div class="chat-body" id="chatBody"></div>
|
139 |
+
<div class="chat-debug" id="chatDebug"></div>
|
140 |
+
<div class="chat-input-area">
|
141 |
+
<form id="chatForm">
|
142 |
+
<textarea class="input-box" rows="2" id="inputBox"></textarea>
|
143 |
+
<input type="submit" class="submit-button" value="Send">
|
144 |
+
</form>
|
145 |
+
</div>
|
146 |
+
</div>
|
147 |
+
|
148 |
+
<script>
|
149 |
+
|
150 |
+
const assistant_image_data = "<ASSISTANT_IMAGE_URL>";
|
151 |
+
|
152 |
+
var USER_MESSAGE_RECEIVED = false;
|
153 |
+
|
154 |
+
/**
 * Escape the characters that have a special meaning in HTML.
 *
 * The ampersand is replaced first so that the entities produced by the
 * following replacements are not escaped a second time.
 *
 * @param {string} unsafe - raw text.
 * @returns {string} text that can be inserted as HTML markup.
 */
function escapeHtml(unsafe) {
    return unsafe
        .replace(/&/g, "&amp;")
        .replace(/</g, "&lt;")
        .replace(/>/g, "&gt;")
        .replace(/"/g, "&quot;")
        .replace(/'/g, "&#039;");
}
|
162 |
+
|
163 |
+
/**
 * Append a message to the chat.
 *
 * Accepts both `addChatMessage(role, msg)` and
 * `addChatMessage(role, timestamp, msg)`. The timestamp is not displayed by
 * this chatbox, it is only accepted so that both call styles work.
 *
 * "user" and "assistant" messages go to the chat body, "info" messages go to
 * the debug area, where only the latest one stays visible. A "user" message
 * also raises the USER_MESSAGE_RECEIVED flag.
 *
 * @param {string} role - "user", "assistant" or "info".
 * @param {...string} rest - either `msg`, or `timestamp, msg`.
 * @throws {TypeError} if the role is not supported.
 */
function addChatMessage(role, ...rest) {
    // with three arguments the message is the last one, not the second one
    const msg = rest.length > 1 ? rest[1] : rest[0];

    const chatBody = document.getElementById('chatBody');
    const chatDebug = document.getElementById('chatDebug');
    const msgContainer = document.createElement('div');
    msgContainer.className = 'message';

    const text = document.createElement('p');

    switch (role) {
        case "user":
            text.className = 'user-message';
            // textContent never interprets the message as markup
            text.textContent = msg;
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        case "assistant": {
            const assistantImg = document.createElement('img');
            assistantImg.src = assistant_image_data;
            assistantImg.alt = 'Assistant';
            assistantImg.className = 'assistant-image';

            text.className = 'assistant-message';
            text.textContent = msg;
            msgContainer.appendChild(assistantImg); // Add the image to the message container
            msgContainer.appendChild(text);
            chatBody.appendChild(msgContainer);
            break;
        }
        case "info":
            text.className = 'info-message';
            // NOTE(review): info messages are inserted as raw HTML, as before;
            // confirm that they never carry untrusted content.
            text.innerHTML = msg;
            msgContainer.appendChild(text);
            // hide previous debug messages
            for (const previous of chatDebug.children) {
                previous.style.display = 'none';
            }
            chatDebug.appendChild(msgContainer);
            break;
        default:
            throw new TypeError(`Illegal role "${role}".`);
    }

    chatBody.scrollTop = chatBody.scrollHeight;

    if (role === "user") {
        USER_MESSAGE_RECEIVED = true;
    }
}
|
210 |
+
|
211 |
+
if (typeof send_user_message !== 'function') {
|
212 |
+
function send_user_message(msg) {
|
213 |
+
// This will be overloaded by playwright
|
214 |
+
}
|
215 |
+
}
|
216 |
+
|
217 |
+
const inputBox = document.getElementById('inputBox');
|
218 |
+
|
219 |
+
/**
 * Forward a message typed by the user, display it, and clear the input box.
 * Blank (whitespace-only) messages are ignored.
 *
 * @param {string} msg - content of the input box.
 */
function send_msg(msg) {
    const isBlank = !msg.trim();
    if (isBlank) {
        return;
    }

    send_user_message(msg);
    addChatMessage('user', msg);
    inputBox.value = '';
}
|
226 |
+
|
227 |
+
inputBox.onkeypress = (e) => {
|
228 |
+
if (e.key === 'Enter' && !e.shiftKey) {
|
229 |
+
e.preventDefault();
|
230 |
+
send_msg(inputBox.value);
|
231 |
+
}
|
232 |
+
};
|
233 |
+
|
234 |
+
document.getElementById('chatForm').onsubmit = function (event) {
|
235 |
+
event.preventDefault();
|
236 |
+
send_msg(inputBox.value);
|
237 |
+
return false;
|
238 |
+
}
|
239 |
+
</script>
|
240 |
+
|
241 |
+
</body>
|
242 |
+
|
243 |
+
</html>
|