sarlinpe committed on
Commit
9665c2c
0 Parent(s):
This view is limited to 50 files because the commit contains too many changes.
Files changed (50)
  1. .flake8 +3 -0
  2. .gitignore +138 -0
  3. CODE_OF_CONDUCT.md +80 -0
  4. CONTRIBUTING.md +31 -0
  5. LICENSE +1 -0
  6. README.md +229 -0
  7. assets/demo.jpg +0 -0
  8. assets/teaser.svg +0 -0
  9. demo.ipynb +0 -0
  10. maploc/__init__.py +28 -0
  11. maploc/conf/__init__.py +0 -0
  12. maploc/conf/data/__init__.py +0 -0
  13. maploc/conf/data/kitti.yaml +29 -0
  14. maploc/conf/data/mapillary.yaml +40 -0
  15. maploc/conf/model/image_encoder/global.yaml +9 -0
  16. maploc/conf/model/image_encoder/resnet_fpn.yaml +7 -0
  17. maploc/conf/model/image_encoder/vgg_unet.yaml +8 -0
  18. maploc/conf/orienternet.yaml +34 -0
  19. maploc/conf/overfit.yaml +17 -0
  20. maploc/conf/training.yaml +22 -0
  21. maploc/data/__init__.py +4 -0
  22. maploc/data/dataset.py +264 -0
  23. maploc/data/image.py +140 -0
  24. maploc/data/kitti/dataset.py +306 -0
  25. maploc/data/kitti/prepare.py +123 -0
  26. maploc/data/kitti/test1_files.txt +0 -0
  27. maploc/data/kitti/test2_files.txt +0 -0
  28. maploc/data/kitti/train_files.txt +0 -0
  29. maploc/data/kitti/utils.py +79 -0
  30. maploc/data/mapillary/dataset.py +350 -0
  31. maploc/data/mapillary/download.py +180 -0
  32. maploc/data/mapillary/prepare.py +406 -0
  33. maploc/data/mapillary/splits_MGL_13loc.json +0 -0
  34. maploc/data/mapillary/utils.py +173 -0
  35. maploc/data/sequential.py +61 -0
  36. maploc/data/torch.py +111 -0
  37. maploc/data/utils.py +60 -0
  38. maploc/demo.py +209 -0
  39. maploc/evaluation/kitti.py +89 -0
  40. maploc/evaluation/mapillary.py +111 -0
  41. maploc/evaluation/run.py +252 -0
  42. maploc/evaluation/utils.py +40 -0
  43. maploc/evaluation/viz.py +178 -0
  44. maploc/models/__init__.py +34 -0
  45. maploc/models/base.py +123 -0
  46. maploc/models/bev_net.py +61 -0
  47. maploc/models/bev_projection.py +91 -0
  48. maploc/models/feature_extractor.py +231 -0
  49. maploc/models/feature_extractor_v2.py +192 -0
  50. maploc/models/map_encoder.py +66 -0
.flake8 ADDED
@@ -0,0 +1,3 @@
1
+ [flake8]
2
+ max-line-length = 88
3
+ extend-ignore = E203
.gitignore ADDED
@@ -0,0 +1,138 @@
1
+ datasets/
2
+ experiments/
3
+ outputs/
4
+ *.mp4
5
+ lsf*
6
+ .DS_Store
7
+
8
+ # Byte-compiled / optimized / DLL files
9
+ __pycache__/
10
+ *.py[cod]
11
+ *$py.class
12
+
13
+ # C extensions
14
+ *.so
15
+
16
+ # Distribution / packaging
17
+ .Python
18
+ build/
19
+ develop-eggs/
20
+ dist/
21
+ downloads/
22
+ eggs/
23
+ .eggs/
24
+ lib64/
25
+ parts/
26
+ sdist/
27
+ var/
28
+ wheels/
29
+ pip-wheel-metadata/
30
+ share/python-wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ *.py,cover
57
+ .hypothesis/
58
+ .pytest_cache/
59
+
60
+ # Translations
61
+ *.mo
62
+ *.pot
63
+
64
+ # Django stuff:
65
+ *.log
66
+ local_settings.py
67
+ db.sqlite3
68
+ db.sqlite3-journal
69
+
70
+ # Flask stuff:
71
+ instance/
72
+ .webassets-cache
73
+
74
+ # Scrapy stuff:
75
+ .scrapy
76
+
77
+ # Sphinx documentation
78
+ docs/_build/
79
+
80
+ # PyBuilder
81
+ target/
82
+
83
+ # Jupyter Notebook
84
+ .ipynb_checkpoints
85
+
86
+ # IPython
87
+ profile_default/
88
+ ipython_config.py
89
+
90
+ # pyenv
91
+ .python-version
92
+
93
+ # pipenv
94
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
96
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
97
+ # install all needed dependencies.
98
+ #Pipfile.lock
99
+
100
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101
+ __pypackages__/
102
+
103
+ # Celery stuff
104
+ celerybeat-schedule
105
+ celerybeat.pid
106
+
107
+ # SageMath parsed files
108
+ *.sage.py
109
+
110
+ # Environments
111
+ .env
112
+ .venv
113
+ env/
114
+ venv/
115
+ ENV/
116
+ env.bak/
117
+ venv.bak/
118
+
119
+ # Spyder project settings
120
+ .spyderproject
121
+ .spyproject
122
+
123
+ # Rope project settings
124
+ .ropeproject
125
+
126
+ # mkdocs documentation
127
+ /site
128
+
129
+ # mypy
130
+ .mypy_cache/
131
+ .dmypy.json
132
+ dmypy.json
133
+
134
+ # Pyre type checker
135
+ .pyre/
136
+
137
+ # vscode
138
+ .vscode
CODE_OF_CONDUCT.md ADDED
@@ -0,0 +1,80 @@
1
+ # Code of Conduct
2
+
3
+ ## Our Pledge
4
+
5
+ In the interest of fostering an open and welcoming environment, we as
6
+ contributors and maintainers pledge to make participation in our project and
7
+ our community a harassment-free experience for everyone, regardless of age, body
8
+ size, disability, ethnicity, sex characteristics, gender identity and expression,
9
+ level of experience, education, socio-economic status, nationality, personal
10
+ appearance, race, religion, or sexual identity and orientation.
11
+
12
+ ## Our Standards
13
+
14
+ Examples of behavior that contributes to creating a positive environment
15
+ include:
16
+
17
+ * Using welcoming and inclusive language
18
+ * Being respectful of differing viewpoints and experiences
19
+ * Gracefully accepting constructive criticism
20
+ * Focusing on what is best for the community
21
+ * Showing empathy towards other community members
22
+
23
+ Examples of unacceptable behavior by participants include:
24
+
25
+ * The use of sexualized language or imagery and unwelcome sexual attention or
26
+ advances
27
+ * Trolling, insulting/derogatory comments, and personal or political attacks
28
+ * Public or private harassment
29
+ * Publishing others' private information, such as a physical or electronic
30
+ address, without explicit permission
31
+ * Other conduct which could reasonably be considered inappropriate in a
32
+ professional setting
33
+
34
+ ## Our Responsibilities
35
+
36
+ Project maintainers are responsible for clarifying the standards of acceptable
37
+ behavior and are expected to take appropriate and fair corrective action in
38
+ response to any instances of unacceptable behavior.
39
+
40
+ Project maintainers have the right and responsibility to remove, edit, or
41
+ reject comments, commits, code, wiki edits, issues, and other contributions
42
+ that are not aligned to this Code of Conduct, or to ban temporarily or
43
+ permanently any contributor for other behaviors that they deem inappropriate,
44
+ threatening, offensive, or harmful.
45
+
46
+ ## Scope
47
+
48
+ This Code of Conduct applies within all project spaces, and it also applies when
49
+ an individual is representing the project or its community in public spaces.
50
+ Examples of representing a project or community include using an official
51
+ project e-mail address, posting via an official social media account, or acting
52
+ as an appointed representative at an online or offline event. Representation of
53
+ a project may be further defined and clarified by project maintainers.
54
+
55
+ This Code of Conduct also applies outside the project spaces when there is a
56
+ reasonable belief that an individual's behavior may have a negative impact on
57
+ the project or its community.
58
+
59
+ ## Enforcement
60
+
61
+ Instances of abusive, harassing, or otherwise unacceptable behavior may be
62
+ reported by contacting the project team at <opensource-conduct@fb.com>. All
63
+ complaints will be reviewed and investigated and will result in a response that
64
+ is deemed necessary and appropriate to the circumstances. The project team is
65
+ obligated to maintain confidentiality with regard to the reporter of an incident.
66
+ Further details of specific enforcement policies may be posted separately.
67
+
68
+ Project maintainers who do not follow or enforce the Code of Conduct in good
69
+ faith may face temporary or permanent repercussions as determined by other
70
+ members of the project's leadership.
71
+
72
+ ## Attribution
73
+
74
+ This Code of Conduct is adapted from the [Contributor Covenant][homepage], version 1.4,
75
+ available at https://www.contributor-covenant.org/version/1/4/code-of-conduct.html
76
+
77
+ [homepage]: https://www.contributor-covenant.org
78
+
79
+ For answers to common questions about this code of conduct, see
80
+ https://www.contributor-covenant.org/faq
CONTRIBUTING.md ADDED
@@ -0,0 +1,31 @@
1
+ # Contributing to OrienterNet
2
+ We want to make contributing to this project as easy and transparent as
3
+ possible.
4
+
5
+ ## Pull Requests
6
+ We actively welcome your pull requests.
7
+
8
+ 1. Fork the repo and create your branch from `main`.
9
+ 2. If you've added code that should be tested, add tests.
10
+ 3. If you've changed APIs, update the documentation.
11
+ 4. Ensure the test suite passes.
12
+ 5. Make sure your code lints.
13
+ 6. If you haven't already, complete the Contributor License Agreement ("CLA").
14
+
15
+ ## Contributor License Agreement ("CLA")
16
+ In order to accept your pull request, we need you to submit a CLA. You only need
17
+ to do this once to work on any of Facebook's open source projects.
18
+
19
+ Complete your CLA here: <https://code.facebook.com/cla>
20
+
21
+ ## Issues
22
+ We use GitHub issues to track public bugs. Please ensure your description is
23
+ clear and has sufficient instructions to be able to reproduce the issue.
24
+
25
+ Facebook has a [bounty program](https://www.facebook.com/whitehat/) for the safe
26
+ disclosure of security bugs. In those cases, please go through the process
27
+ outlined on that page and do not file a public issue.
28
+
29
+ ## License
30
+ By contributing to OrienterNet, you agree that your contributions will be licensed
31
+ under the LICENSE file in the root directory of this source tree.
LICENSE ADDED
@@ -0,0 +1 @@
1
+ The MGL dataset is made available under the CC-BY-SA license following the data available on the Mapillary platform. The model implementation and the pre-trained weights follow a CC-BY-NC license. OpenStreetMap data follows its own license.
README.md ADDED
@@ -0,0 +1,229 @@
1
+ <p align="center">
2
+ <h1 align="center"><ins>OrienterNet</ins><br>Visual Localization in 2D Public Maps<br>with Neural Matching</h1>
3
+ <p align="center">
4
+ <a href="https://psarlin.com/">Paul-Edouard&nbsp;Sarlin</a>
5
+ ·
6
+ <a href="https://danieldetone.com/">Daniel&nbsp;DeTone</a>
7
+ ·
8
+ <a href="https://scholar.google.com/citations?user=WhISCE4AAAAJ&hl=en">Tsun-Yi&nbsp;Yang</a>
9
+ ·
10
+ <a href="https://scholar.google.com/citations?user=Ta4TDJoAAAAJ&hl=en">Armen&nbsp;Avetisyan</a>
11
+ ·
12
+ <a href="https://scholar.google.com/citations?hl=en&user=49_cCT8AAAAJ">Julian&nbsp;Straub</a>
13
+ <br>
14
+ <a href="https://tom.ai/">Tomasz&nbsp;Malisiewicz</a>
15
+ ·
16
+ <a href="https://scholar.google.com/citations?user=484sccEAAAAJ&hl=en">Samuel&nbsp;Rota&nbsp;Bulo</a>
17
+ ·
18
+ <a href="https://scholar.google.com/citations?hl=en&user=MhowvPkAAAAJ">Richard&nbsp;Newcombe</a>
19
+ ·
20
+ <a href="https://scholar.google.com/citations?hl=en&user=CxbDDRMAAAAJ">Peter&nbsp;Kontschieder</a>
21
+ ·
22
+ <a href="https://scholar.google.com/citations?user=AGoNHcsAAAAJ&hl=en">Vasileios&nbsp;Balntas</a>
23
+ </p>
24
+ <h2 align="center">CVPR 2023</h2>
25
+ <h3 align="center"><a href="https://arxiv.org/pdf/2304.02009.pdf">Paper</a> | <a href="https://psarlin.com/orienternet">Project Page</a> | <a href="https://youtu.be/wglW8jnupSs">Video</a></h3>
26
+ <div align="center"></div>
27
+ </p>
28
+ <p align="center">
29
+ <a href="https://psarlin.com/orienternet"><img src="assets/teaser.svg" alt="teaser" width="60%"></a>
30
+ <br>
31
+ <em>OrienterNet is a deep neural network that can accurately localize an image<br>using the same 2D semantic maps that humans use to orient themselves.</em>
32
+ </p>
33
+
34
+ ##
35
+
36
+ This repository hosts the source code for OrienterNet, a research project by Meta Reality Labs. OrienterNet uses deep learning to accurately localize images using free and globally available maps from OpenStreetMap. Unlike existing algorithms that rely on complex 3D point clouds, OrienterNet estimates a position and orientation by matching a neural Bird's-Eye View against 2D maps.
37
+
38
+ ## Installation
39
+
40
+ OrienterNet requires Python >= 3.8 and [PyTorch](https://pytorch.org/). To run the demo, clone this repo and install the minimal requirements:
41
+
42
+ ```bash
43
+ git clone https://github.com/facebookresearch/OrienterNet
44
+ python -m pip install -r requirements/demo.txt
45
+ ```
46
+
47
+ To run the evaluation and training, install the full requirements:
48
+
49
+ ```bash
50
+ python -m pip install -r requirements/full.txt
51
+ ```
52
+
53
+ ## Demo ➡️ [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1zH_2mzdB18BnJVq48ZvJhMorcRjrWAXI?usp=sharing)
54
+
55
+ Check out the Jupyter notebook [`demo.ipynb`](./demo.ipynb) ([run it on Colab!](https://colab.research.google.com/drive/1zH_2mzdB18BnJVq48ZvJhMorcRjrWAXI?usp=sharing)) for a minimal demo - take a picture with your phone in any city and find its exact location in a few seconds!
56
+
57
+ <p align="center">
58
+ <a href="./demo.ipynb"><img src="assets/demo.jpg" alt="demo" width="60%"></a>
59
+ <br>
60
+ <em>OrienterNet positions any image within a large area - try it with your own images!</em>
61
+ </p>
62
+
63
+ ## Evaluation
64
+
65
+ #### Mapillary Geo-Localization dataset
66
+
67
+ <details>
68
+ <summary>[Click to expand]</summary>
69
+
70
+ To obtain the dataset:
71
+
72
+ 1. Create a developer account at [mapillary.com](https://www.mapillary.com/dashboard/developers) and obtain a free access token.
73
+ 2. Run the following script to download the data from Mapillary and prepare it:
74
+
75
+ ```bash
76
+ python -m maploc.data.mapillary.prepare --token $YOUR_ACCESS_TOKEN
77
+ ```
78
+
79
+ By default the data is written to the directory `./datasets/MGL/`. Then run the evaluation with the pre-trained model:
80
+
81
+ ```bash
82
+ python -m maploc.evaluation.mapillary --experiment OrienterNet_MGL model.num_rotations=256
83
+ ```
84
+
85
+ This downloads the pre-trained models if necessary. The results should be close to the following:
86
+
87
+ ```
88
+ Recall xy_max_error: [14.37, 48.69, 61.7] at (1, 3, 5) m/°
89
+ Recall yaw_max_error: [20.95, 54.96, 70.17] at (1, 3, 5) m/°
90
+ ```
91
+
92
+ Evaluating with 256 rotations requires a GPU with 11 GB of memory. If you run into out-of-memory (OOM) issues, consider reducing the number of rotations:
93
+
94
+ ```bash
95
+ python -m maploc.evaluation.mapillary --experiment OrienterNet_MGL \
96
+ model.num_rotations=128
97
+ ```
98
+
99
+ To export visualizations for the first 100 examples:
100
+
101
+ ```bash
102
+ python -m maploc.evaluation.mapillary --experiment OrienterNet_MGL \
103
+ --output_dir ./viz_MGL/ --num 100
104
+ ```
105
+
106
+ To run the evaluation in sequential mode (by default with 10 frames):
107
+
108
+ ```bash
109
+ python -m maploc.evaluation.mapillary --experiment OrienterNet_MGL --sequential
110
+ ```
111
+
112
+ </details>
113
+
114
+ #### KITTI dataset
115
+
116
+ <details>
117
+ <summary>[Click to expand]</summary>
118
+
119
+ 1. Download and prepare the dataset to `./datasets/kitti/`:
120
+
121
+ ```bash
122
+ python -m maploc.data.kitti.prepare
123
+ ```
124
+
125
+ 2. Run the evaluation with the model trained on MGL:
126
+
127
+ ```bash
128
+ python -m maploc.evaluation.kitti --experiment OrienterNet_MGL
129
+ ```
130
+
131
+ You should expect the following results:
132
+
133
+ ```
134
+ Recall directional_error: [[50.33, 85.18, 92.73], [24.38, 56.13, 67.98]] at (1, 3, 5) m/°
135
+ Recall yaw_max_error: [29.22, 68.2, 84.49] at (1, 3, 5) m/°
136
+ ```
137
+
138
+ You can similarly export some visual examples:
139
+
140
+ ```bash
141
+ python -m maploc.evaluation.kitti --experiment OrienterNet_MGL \
142
+ --output_dir ./viz_KITTI/ --num 100
143
+ ```
144
+
145
+ </details>
146
+
147
+ #### Aria Detroit & Seattle
148
+
149
+ We are currently unable to release the dataset used to evaluate OrienterNet in the CVPR 2023 paper.
150
+
151
+ ## Training
152
+
153
+ #### MGL dataset
154
+
155
+ We trained the model on the MGL dataset using 3x 3090 GPUs (24GB VRAM each) and a total batch size of 12 for 340k iterations (about 3-4 days) with the following command:
156
+
157
+ ```bash
158
+ python -m maploc.train experiment.name=OrienterNet_MGL_reproduce
159
+ ```
160
+
161
+ Feel free to use any other experiment name. Configurations are managed by [Hydra](https://hydra.cc/) and [OmegaConf](https://omegaconf.readthedocs.io) so any entry can be overridden from the command line. You may thus reduce the number of GPUs and the batch size via:
162
+
163
+ ```bash
164
+ python -m maploc.train experiment.name=OrienterNet_MGL_reproduce \
165
+ experiment.gpus=1 data.loading.train.batch_size=4
166
+ ```
167
+
168
+ Be aware that this can reduce the overall performance. The checkpoints are written to `./experiments/experiment_name/`. Then run the evaluation:
169
+
170
+ ```bash
171
+ # the best checkpoint:
172
+ python -m maploc.evaluation.mapillary --experiment OrienterNet_MGL_reproduce
173
+ # a specific checkpoint:
174
+ python -m maploc.evaluation.mapillary \
175
+ --experiment OrienterNet_MGL_reproduce/checkpoint-step=340000.ckpt
176
+ ```
177
+
178
+ #### KITTI
179
+
180
+ To fine-tune a trained model on the KITTI dataset:
181
+
182
+ ```bash
183
+ python -m maploc.train experiment.name=OrienterNet_MGL_kitti data=kitti \
184
+ training.finetune_from_checkpoint='"experiments/OrienterNet_MGL_reproduce/checkpoint-step=340000.ckpt"'
185
+ ```
186
+
187
+ ## Interactive development
188
+
189
+ We provide several visualization notebooks:
190
+
191
+ - [Visualize predictions on the MGL dataset](./notebooks/visualize_predictions_mgl.ipynb)
192
+ - [Visualize predictions on the KITTI dataset](./notebooks/visualize_predictions_kitti.ipynb)
193
+ - [Visualize sequential predictions](./notebooks/visualize_predictions_sequences.ipynb)
194
+
195
+ ## OpenStreetMap data
196
+
197
+ <details>
198
+ <summary>[Click to expand]</summary>
199
+
200
+ To make sure that the results are consistent over time, we used OSM data downloaded from [Geofabrik](https://download.geofabrik.de/) in November 2021. By default, the dataset scripts `maploc.data.[mapillary,kitti].prepare` download pre-generated raster tiles. If you wish to use different OSM classes, you can pass `--generate_tiles`, which will download and use our prepared raw `.osm` XML files. You may alternatively download more recent files.
201
+
202
+ </details>
203
+
204
+ ## License
205
+
206
+ The MGL dataset is made available under the [CC-BY-SA](https://creativecommons.org/licenses/by-sa/4.0/) license following the data available on the Mapillary platform. The model implementation and the pre-trained weights follow a [CC-BY-NC](https://creativecommons.org/licenses/by-nc/2.0/) license. Keep in mind that OpenStreetMap [follows a different license](https://www.openstreetmap.org/copyright).
207
+
208
+ ## BibTex citation
209
+
210
+ Please consider citing our work if you use any code from this repo or ideas presented in the paper:
211
+ ```
212
+ @inproceedings{sarlin2023orienternet,
213
+ author = {Paul-Edouard Sarlin and
214
+ Daniel DeTone and
215
+ Tsun-Yi Yang and
216
+ Armen Avetisyan and
217
+ Julian Straub and
218
+ Tomasz Malisiewicz and
219
+ Samuel Rota Bulo and
220
+ Richard Newcombe and
221
+ Peter Kontschieder and
222
+ Vasileios Balntas},
223
+ title = {{OrienterNet: Visual Localization in 2D Public Maps with Neural Matching}},
224
+ booktitle = {CVPR},
225
+ year = {2023},
226
+ }
227
+ ```
228
+
229
+
assets/demo.jpg ADDED
assets/teaser.svg ADDED
demo.ipynb ADDED
The diff for this file is too large to render.
maploc/__init__.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from pathlib import Path
4
+ import logging
5
+
6
+ import pytorch_lightning # noqa: F401
7
+
8
+
9
+ formatter = logging.Formatter(
10
+ fmt="[%(asctime)s %(name)s %(levelname)s] %(message)s",
11
+ datefmt="%Y-%m-%d %H:%M:%S",
12
+ )
13
+ handler = logging.StreamHandler()
14
+ handler.setFormatter(formatter)
15
+ handler.setLevel(logging.INFO)
16
+
17
+ logger = logging.getLogger("maploc")
18
+ logger.setLevel(logging.INFO)
19
+ logger.addHandler(handler)
20
+ logger.propagate = False
21
+
22
+ pl_logger = logging.getLogger("pytorch_lightning")
23
+ if len(pl_logger.handlers):
24
+ pl_logger.handlers[0].setFormatter(formatter)
25
+
26
+ repo_dir = Path(__file__).parent.parent
27
+ EXPERIMENTS_PATH = repo_dir / "experiments/"
28
+ DATASETS_PATH = repo_dir / "datasets/"
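*Illustrative note (not part of the commit):* the package module above exposes a shared logger and the default `experiments/` and `datasets/` roots. A minimal sketch of how downstream code consumes them, assuming the repository and its dependencies are installed or on `PYTHONPATH`:

```python
# Minimal sketch (assumes maploc is importable): reuse the package-level
# logger and dataset root defined in maploc/__init__.py.
from maploc import DATASETS_PATH, logger

kitti_root = DATASETS_PATH / "kitti"  # resolves to <repo>/datasets/kitti
logger.info("Expecting the KITTI data under %s", kitti_root)
```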
maploc/conf/__init__.py ADDED
File without changes
maploc/conf/data/__init__.py ADDED
File without changes
maploc/conf/data/kitti.yaml ADDED
@@ -0,0 +1,29 @@
1
+ name: kitti
2
+ loading:
3
+ train:
4
+ batch_size: 9
5
+ num_workers: ${.batch_size}
6
+ val:
7
+ batch_size: ${..train.batch_size}
8
+ num_workers: ${.batch_size}
9
+ # make sure train and val locations are at least 5m apart
10
+ selection_subset_val: furthest
11
+ max_num_val: 500
12
+ drop_train_too_close_to_val: 5.0
13
+ # map data
14
+ num_classes:
15
+ areas: 7
16
+ ways: 10
17
+ nodes: 33
18
+ pixel_per_meter: 2
19
+ crop_size_meters: 64
20
+ max_init_error: 32
21
+ # preprocessing
22
+ target_focal_length: 256
23
+ resize_image: [448, 160] # multiple of 32 at f=256px
24
+ # pad_to_multiple: 32
25
+ rectify_pitch: true
26
+ augmentation:
27
+ rot90: true
28
+ flip: true
29
+ image: {apply: true}
maploc/conf/data/mapillary.yaml ADDED
@@ -0,0 +1,40 @@
1
+ name: mapillary
2
+ scenes:
3
+ - sanfrancisco_soma
4
+ - sanfrancisco_hayes
5
+ - amsterdam
6
+ - berlin
7
+ - lemans
8
+ - montrouge
9
+ - toulouse
10
+ - nantes
11
+ - vilnius
12
+ - avignon
13
+ - helsinki
14
+ - milan
15
+ - paris
16
+ split: splits_MGL_13loc.json
17
+ loading:
18
+ train:
19
+ batch_size: 12
20
+ num_workers: ${.batch_size}
21
+ val:
22
+ batch_size: ${..train.batch_size}
23
+ num_workers: ${.batch_size}
24
+ # map data
25
+ num_classes:
26
+ areas: 7
27
+ ways: 10
28
+ nodes: 33
29
+ pixel_per_meter: 2
30
+ crop_size_meters: 64
31
+ max_init_error: 48
32
+ add_map_mask: true
33
+ # preprocessing
34
+ resize_image: 512
35
+ pad_to_square: true
36
+ rectify_pitch: true
37
+ augmentation:
38
+ rot90: true
39
+ flip: true
40
+ image: {apply: true}
maploc/conf/model/image_encoder/global.yaml ADDED
@@ -0,0 +1,9 @@
1
+ name: feature_extractor
2
+ backbone:
3
+ encoder: resnet18
4
+ pretrained: true
5
+ output_dim: ${...latent_dim}
6
+ output_scales: [5]
7
+ num_downsample: 5
8
+ decoder: null
9
+ pooling: mean
maploc/conf/model/image_encoder/resnet_fpn.yaml ADDED
@@ -0,0 +1,7 @@
1
+ name: feature_extractor_v2
2
+ backbone:
3
+ encoder: resnet50
4
+ pretrained: true
5
+ output_dim: ${...latent_dim}
6
+ num_downsample: null
7
+ remove_stride_from_first_conv: false
maploc/conf/model/image_encoder/vgg_unet.yaml ADDED
@@ -0,0 +1,8 @@
1
+ name: feature_extractor
2
+ backbone:
3
+ encoder: vgg16
4
+ pretrained: true
5
+ output_dim: ${...latent_dim}
6
+ output_scales: [0]
7
+ num_downsample: 4
8
+ decoder: [512, 256, 256, 128]
maploc/conf/orienternet.yaml ADDED
@@ -0,0 +1,34 @@
1
+ defaults:
2
+ - data: mapillary
3
+ - model/image_encoder: resnet_fpn
4
+ - training
5
+ - _self_
6
+ model:
7
+ name: orienternet
8
+ latent_dim: 128
9
+ matching_dim: 8
10
+ z_max: 32
11
+ x_max: 32
12
+ pixel_per_meter: ${data.pixel_per_meter}
13
+ num_scale_bins: 33
14
+ num_rotations: 64
15
+ image_encoder:
16
+ backbone:
17
+ encoder: resnet101
18
+ map_encoder:
19
+ embedding_dim: 16
20
+ output_dim: ${..matching_dim}
21
+ num_classes: ${data.num_classes}
22
+ backbone:
23
+ encoder: vgg19
24
+ pretrained: false
25
+ output_scales: [0]
26
+ num_downsample: 3
27
+ decoder: [128, 64, 64]
28
+ padding: replicate
29
+ unary_prior: false
30
+ bev_net:
31
+ num_blocks: 4
32
+ latent_dim: ${..latent_dim}
33
+ output_dim: ${..matching_dim}
34
+ confidence: true
maploc/conf/overfit.yaml ADDED
@@ -0,0 +1,17 @@
1
+ defaults:
2
+ - orienternet
3
+ - _self_
4
+ data:
5
+ loading:
6
+ train:
7
+ batch_size: 6
8
+ random: false
9
+ split: null
10
+ model:
11
+ freeze_batch_normalization: true
12
+ training:
13
+ trainer:
14
+ overfit_batches: 1
15
+ val_check_interval: 1
16
+ log_every_n_steps: 1
17
+ limit_val_batches: 1
maploc/conf/training.yaml ADDED
@@ -0,0 +1,22 @@
1
+ experiment:
2
+ name: ???
3
+ gpus: 3
4
+ seed: 0
5
+ training:
6
+ lr: 1e-4
7
+ lr_scheduler: null
8
+ finetune_from_checkpoint: null
9
+ trainer:
10
+ val_check_interval: 5000
11
+ log_every_n_steps: 100
12
+ limit_val_batches: 1000
13
+ max_steps: 500000
14
+ devices: ${experiment.gpus}
15
+ checkpointing:
16
+ monitor: "loss/total/val"
17
+ save_top_k: 5
18
+ mode: "min"
19
+ hydra:
20
+ job:
21
+ name: ${experiment.name}
22
+ chdir: false
maploc/data/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .kitti.dataset import KittiDataModule
2
+ from .mapillary.dataset import MapillaryDataModule
3
+
4
+ modules = {"mapillary": MapillaryDataModule, "kitti": KittiDataModule}
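*Illustrative note (not part of the commit):* a hypothetical sketch of how this registry can build a datamodule from a data config. The config values below are made up, and the KITTI data is assumed to have already been prepared with `maploc.data.kitti.prepare`.

```python
# Hypothetical usage of the `modules` registry: the `name` entry of a data
# config selects the corresponding LightningDataModule.
from omegaconf import OmegaConf

from maploc.data import modules

data_cfg = OmegaConf.create(
    {"name": "kitti", "loading": {"train": {"batch_size": 2, "num_workers": 0}}}
)
datamodule = modules[data_cfg.name](data_cfg)
datamodule.prepare_data()  # raises if ./datasets/kitti has not been downloaded
datamodule.setup("fit")
train_loader = datamodule.train_dataloader()
```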
maploc/data/dataset.py ADDED
@@ -0,0 +1,264 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from copy import deepcopy
4
+ from pathlib import Path
5
+ from typing import Any, Dict, List
6
+
7
+ import numpy as np
8
+ import torch
9
+ import torch.utils.data as torchdata
10
+ import torchvision.transforms as tvf
11
+ from omegaconf import DictConfig, OmegaConf
12
+
13
+ from ..models.utils import deg2rad, rotmat2d
14
+ from ..osm.tiling import TileManager
15
+ from ..utils.geo import BoundaryBox
16
+ from ..utils.io import read_image
17
+ from ..utils.wrappers import Camera
18
+ from .image import pad_image, rectify_image, resize_image
19
+ from .utils import decompose_rotmat, random_flip, random_rot90
20
+
21
+
22
+ class MapLocDataset(torchdata.Dataset):
23
+ default_cfg = {
24
+ "seed": 0,
25
+ "accuracy_gps": 15,
26
+ "random": True,
27
+ "num_threads": None,
28
+ # map
29
+ "num_classes": None,
30
+ "pixel_per_meter": "???",
31
+ "crop_size_meters": "???",
32
+ "max_init_error": "???",
33
+ "max_init_error_rotation": None,
34
+ "init_from_gps": False,
35
+ "return_gps": False,
36
+ "force_camera_height": None,
37
+ # pose priors
38
+ "add_map_mask": False,
39
+ "mask_radius": None,
40
+ "mask_pad": 1,
41
+ "prior_range_rotation": None,
42
+ # image preprocessing
43
+ "target_focal_length": None,
44
+ "reduce_fov": None,
45
+ "resize_image": None,
46
+ "pad_to_square": False, # legacy
47
+ "pad_to_multiple": 32,
48
+ "rectify_pitch": True,
49
+ "augmentation": {
50
+ "rot90": False,
51
+ "flip": False,
52
+ "image": {
53
+ "apply": False,
54
+ "brightness": 0.5,
55
+ "contrast": 0.4,
56
+ "saturation": 0.4,
57
+ "hue": 0.5 / 3.14,
58
+ },
59
+ },
60
+ }
61
+
62
+ def __init__(
63
+ self,
64
+ stage: str,
65
+ cfg: DictConfig,
66
+ names: List[str],
67
+ data: Dict[str, Any],
68
+ image_dirs: Dict[str, Path],
69
+ tile_managers: Dict[str, TileManager],
70
+ image_ext: str = "",
71
+ ):
72
+ self.stage = stage
73
+ self.cfg = deepcopy(cfg)
74
+ self.data = data
75
+ self.image_dirs = image_dirs
76
+ self.tile_managers = tile_managers
77
+ self.names = names
78
+ self.image_ext = image_ext
79
+
80
+ tfs = []
81
+ if stage == "train" and cfg.augmentation.image.apply:
82
+ args = OmegaConf.masked_copy(
83
+ cfg.augmentation.image, ["brightness", "contrast", "saturation", "hue"]
84
+ )
85
+ tfs.append(tvf.ColorJitter(**args))
86
+ self.tfs = tvf.Compose(tfs)
87
+
88
+ def __len__(self):
89
+ return len(self.names)
90
+
91
+ def __getitem__(self, idx):
92
+ if self.stage == "train" and self.cfg.random:
93
+ seed = None
94
+ else:
95
+ seed = [self.cfg.seed, idx]
96
+ (seed,) = np.random.SeedSequence(seed).generate_state(1)
97
+
98
+ scene, seq, name = self.names[idx]
99
+ if self.cfg.init_from_gps:
100
+ latlon_gps = self.data["gps_position"][idx][:2].clone().numpy()
101
+ xy_w_init = self.tile_managers[scene].projection.project(latlon_gps)
102
+ else:
103
+ xy_w_init = self.data["t_c2w"][idx][:2].clone().double().numpy()
104
+
105
+ if "shifts" in self.data:
106
+ yaw = self.data["roll_pitch_yaw"][idx][-1]
107
+ R_c2w = rotmat2d((90 - yaw) / 180 * np.pi).float()
108
+ error = (R_c2w @ self.data["shifts"][idx][:2]).numpy()
109
+ else:
110
+ error = np.random.RandomState(seed).uniform(-1, 1, size=2)
111
+ xy_w_init += error * self.cfg.max_init_error
112
+
113
+ bbox_tile = BoundaryBox(
114
+ xy_w_init - self.cfg.crop_size_meters,
115
+ xy_w_init + self.cfg.crop_size_meters,
116
+ )
117
+ return self.get_view(idx, scene, seq, name, seed, bbox_tile)
118
+
119
+ def get_view(self, idx, scene, seq, name, seed, bbox_tile):
120
+ data = {
121
+ "index": idx,
122
+ "name": name,
123
+ "scene": scene,
124
+ "sequence": seq,
125
+ }
126
+ cam_dict = self.data["cameras"][scene][seq][self.data["camera_id"][idx]]
127
+ cam = Camera.from_dict(cam_dict).float()
128
+
129
+ if "roll_pitch_yaw" in self.data:
130
+ roll, pitch, yaw = self.data["roll_pitch_yaw"][idx].numpy()
131
+ else:
132
+ roll, pitch, yaw = decompose_rotmat(self.data["R_c2w"][idx].numpy())
133
+ image = read_image(self.image_dirs[scene] / (name + self.image_ext))
134
+
135
+ if "plane_params" in self.data:
136
+ # transform the plane parameters from world to camera frames
137
+ plane_w = self.data["plane_params"][idx]
138
+ data["ground_plane"] = torch.cat(
139
+ [rotmat2d(deg2rad(torch.tensor(yaw))) @ plane_w[:2], plane_w[2:]]
140
+ )
141
+ if self.cfg.force_camera_height is not None:
142
+ data["camera_height"] = torch.tensor(self.cfg.force_camera_height)
143
+ elif "camera_height" in self.data:
144
+ data["camera_height"] = self.data["height"][idx].clone()
145
+
146
+ # raster extraction
147
+ canvas = self.tile_managers[scene].query(bbox_tile)
148
+ xy_w_gt = self.data["t_c2w"][idx][:2].numpy()
149
+ uv_gt = canvas.to_uv(xy_w_gt)
150
+ uv_init = canvas.to_uv(bbox_tile.center)
151
+ raster = canvas.raster # C, H, W
152
+
153
+ # Map augmentations
154
+ heading = np.deg2rad(90 - yaw) # fixme
155
+ if self.stage == "train":
156
+ if self.cfg.augmentation.rot90:
157
+ raster, uv_gt, heading = random_rot90(raster, uv_gt, heading, seed)
158
+ if self.cfg.augmentation.flip:
159
+ image, raster, uv_gt, heading = random_flip(
160
+ image, raster, uv_gt, heading, seed
161
+ )
162
+ yaw = 90 - np.rad2deg(heading) # fixme
163
+
164
+ image, valid, cam, roll, pitch = self.process_image(
165
+ image, cam, roll, pitch, seed
166
+ )
167
+
168
+ # Create the mask for prior location
169
+ if self.cfg.add_map_mask:
170
+ data["map_mask"] = torch.from_numpy(self.create_map_mask(canvas))
171
+
172
+ if self.cfg.max_init_error_rotation is not None:
173
+ if "shifts" in self.data:
174
+ error = self.data["shifts"][idx][-1]
175
+ else:
176
+ error = np.random.RandomState(seed + 1).uniform(-1, 1)
177
+ error = torch.tensor(error, dtype=torch.float)
178
+ yaw_init = yaw + error * self.cfg.max_init_error_rotation
179
+ range_ = self.cfg.prior_range_rotation or self.cfg.max_init_error_rotation
180
+ data["yaw_prior"] = torch.stack([yaw_init, torch.tensor(range_)])
181
+
182
+ if self.cfg.return_gps:
183
+ gps = self.data["gps_position"][idx][:2].numpy()
184
+ xy_gps = self.tile_managers[scene].projection.project(gps)
185
+ data["uv_gps"] = torch.from_numpy(canvas.to_uv(xy_gps)).float()
186
+ data["accuracy_gps"] = torch.tensor(
187
+ min(self.cfg.accuracy_gps, self.cfg.crop_size_meters)
188
+ )
189
+
190
+ if "chunk_index" in self.data:
191
+ data["chunk_id"] = (scene, seq, self.data["chunk_index"][idx])
192
+
193
+ return {
194
+ **data,
195
+ "image": image,
196
+ "valid": valid,
197
+ "camera": cam,
198
+ "canvas": canvas,
199
+ "map": torch.from_numpy(np.ascontiguousarray(raster)).long(),
200
+ "uv": torch.from_numpy(uv_gt).float(), # TODO: maybe rename to uv?
201
+ "uv_init": torch.from_numpy(uv_init).float(), # TODO: maybe rename to uv?
202
+ "roll_pitch_yaw": torch.tensor((roll, pitch, yaw)).float(),
203
+ "pixels_per_meter": torch.tensor(canvas.ppm).float(),
204
+ }
205
+
206
+ def process_image(self, image, cam, roll, pitch, seed):
207
+ image = (
208
+ torch.from_numpy(np.ascontiguousarray(image))
209
+ .permute(2, 0, 1)
210
+ .float()
211
+ .div_(255)
212
+ )
213
+ image, valid = rectify_image(
214
+ image, cam, roll, pitch if self.cfg.rectify_pitch else None
215
+ )
216
+ roll = 0.0
217
+ if self.cfg.rectify_pitch:
218
+ pitch = 0.0
219
+
220
+ if self.cfg.target_focal_length is not None:
221
+ # resize to a canonical focal length
222
+ factor = self.cfg.target_focal_length / cam.f.numpy()
223
+ size = (np.array(image.shape[-2:][::-1]) * factor).astype(int)
224
+ image, _, cam, valid = resize_image(image, size, camera=cam, valid=valid)
225
+ size_out = self.cfg.resize_image
226
+ if size_out is None:
227
+ # round the edges up such that they are multiple of a factor
228
+ stride = self.cfg.pad_to_multiple
229
+ size_out = (np.ceil((size / stride)) * stride).astype(int)
230
+ # crop or pad such that both edges are of the given size
231
+ image, valid, cam = pad_image(
232
+ image, size_out, cam, valid, crop_and_center=True
233
+ )
234
+ elif self.cfg.resize_image is not None:
235
+ image, _, cam, valid = resize_image(
236
+ image, self.cfg.resize_image, fn=max, camera=cam, valid=valid
237
+ )
238
+ if self.cfg.pad_to_square:
239
+ # pad such that both edges are of the given size
240
+ image, valid, cam = pad_image(image, self.cfg.resize_image, cam, valid)
241
+
242
+ if self.cfg.reduce_fov is not None:
243
+ h, w = image.shape[-2:]
244
+ f = float(cam.f[0])
245
+ fov = np.arctan(w / f / 2)
246
+ w_new = round(2 * f * np.tan(self.cfg.reduce_fov * fov))
247
+ image, valid, cam = pad_image(
248
+ image, (w_new, h), cam, valid, crop_and_center=True
249
+ )
250
+
251
+ with torch.random.fork_rng(devices=[]):
252
+ torch.manual_seed(seed)
253
+ image = self.tfs(image)
254
+ return image, valid, cam, roll, pitch
255
+
256
+ def create_map_mask(self, canvas):
257
+ map_mask = np.zeros(canvas.raster.shape[-2:], bool)
258
+ radius = self.cfg.mask_radius or self.cfg.max_init_error
259
+ mask_min, mask_max = np.round(
260
+ canvas.to_uv(canvas.bbox.center)
261
+ + np.array([[-1], [1]]) * (radius + self.cfg.mask_pad) * canvas.ppm
262
+ ).astype(int)
263
+ map_mask[mask_min[1] : mask_max[1], mask_min[0] : mask_max[0]] = True
264
+ return map_mask
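*Illustrative note (not part of the commit):* a standalone numeric example of the geometry in `create_map_mask` above. The canvas size, crop center, and 32 m error radius are made-up values; `mask_pad = 1` and 2 pixels per meter match the defaults in this commit.

```python
# With an error radius of 32 m, mask_pad = 1 and ppm = 2 px/m, the prior mask
# extends (32 + 1) * 2 = 66 px on each side of the crop center.
import numpy as np

center_uv = np.array([128.0, 128.0])  # crop center in pixels (hypothetical canvas)
radius_px = (32 + 1) * 2
mask_min, mask_max = np.round(
    center_uv + np.array([[-1], [1]]) * radius_px
).astype(int)
mask = np.zeros((256, 256), bool)
mask[mask_min[1]:mask_max[1], mask_min[0]:mask_max[0]] = True
print(mask.sum())  # 132 * 132 = 17424 valid prior locations
```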
maploc/data/image.py ADDED
@@ -0,0 +1,140 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Callable, Optional, Union, Sequence
4
+
5
+ import numpy as np
6
+ import torch
7
+ import torchvision.transforms.functional as tvf
8
+ import collections
9
+ from scipy.spatial.transform import Rotation
10
+
11
+ from ..utils.geometry import from_homogeneous, to_homogeneous
12
+ from ..utils.wrappers import Camera
13
+
14
+
15
+ def rectify_image(
16
+ image: torch.Tensor,
17
+ cam: Camera,
18
+ roll: float,
19
+ pitch: Optional[float] = None,
20
+ valid: Optional[torch.Tensor] = None,
21
+ ):
22
+ *_, h, w = image.shape
23
+ grid = torch.meshgrid(
24
+ [torch.arange(w, device=image.device), torch.arange(h, device=image.device)],
25
+ indexing="xy",
26
+ )
27
+ grid = torch.stack(grid, -1).to(image.dtype)
28
+
29
+ if pitch is not None:
30
+ args = ("ZX", (roll, pitch))
31
+ else:
32
+ args = ("Z", roll)
33
+ R = Rotation.from_euler(*args, degrees=True).as_matrix()
34
+ R = torch.from_numpy(R).to(image)
35
+
36
+ grid_rect = to_homogeneous(cam.normalize(grid)) @ R.T
37
+ grid_rect = cam.denormalize(from_homogeneous(grid_rect))
38
+ grid_norm = (grid_rect + 0.5) / grid.new_tensor([w, h]) * 2 - 1
39
+ rectified = torch.nn.functional.grid_sample(
40
+ image[None],
41
+ grid_norm[None],
42
+ align_corners=False,
43
+ mode="bilinear",
44
+ ).squeeze(0)
45
+ if valid is None:
46
+ valid = torch.all((grid_norm >= -1) & (grid_norm <= 1), -1)
47
+ else:
48
+ valid = (
49
+ torch.nn.functional.grid_sample(
50
+ valid[None, None].float(),
51
+ grid_norm[None],
52
+ align_corners=False,
53
+ mode="nearest",
54
+ )[0, 0]
55
+ > 0
56
+ )
57
+ return rectified, valid
58
+
59
+
60
+ def resize_image(
61
+ image: torch.Tensor,
62
+ size: Union[int, Sequence, np.ndarray],
63
+ fn: Optional[Callable] = None,
64
+ camera: Optional[Camera] = None,
65
+ valid: np.ndarray = None,
66
+ ):
67
+ """Resize an image to a fixed size, or according to max or min edge."""
68
+ *_, h, w = image.shape
69
+ if fn is not None:
70
+ assert isinstance(size, int)
71
+ scale = size / fn(h, w)
72
+ h_new, w_new = int(round(h * scale)), int(round(w * scale))
73
+ scale = (scale, scale)
74
+ else:
75
+ if isinstance(size, (collections.abc.Sequence, np.ndarray)):
76
+ w_new, h_new = size
77
+ elif isinstance(size, int):
78
+ w_new = h_new = size
79
+ else:
80
+ raise ValueError(f"Incorrect new size: {size}")
81
+ scale = (w_new / w, h_new / h)
82
+ if (w, h) != (w_new, h_new):
83
+ mode = tvf.InterpolationMode.BILINEAR
84
+ image = tvf.resize(image, (h_new, w_new), interpolation=mode, antialias=True)
85
+ image.clip_(0, 1)
86
+ if camera is not None:
87
+ camera = camera.scale(scale)
88
+ if valid is not None:
89
+ valid = tvf.resize(
90
+ valid.unsqueeze(0),
91
+ (h_new, w_new),
92
+ interpolation=tvf.InterpolationMode.NEAREST,
93
+ ).squeeze(0)
94
+ ret = [image, scale]
95
+ if camera is not None:
96
+ ret.append(camera)
97
+ if valid is not None:
98
+ ret.append(valid)
99
+ return ret
100
+
101
+
102
+ def pad_image(
103
+ image: torch.Tensor,
104
+ size: Union[int, Sequence, np.ndarray],
105
+ camera: Optional[Camera] = None,
106
+ valid: torch.Tensor = None,
107
+ crop_and_center: bool = False,
108
+ ):
109
+ if isinstance(size, int):
110
+ w_new = h_new = size
111
+ elif isinstance(size, (collections.abc.Sequence, np.ndarray)):
112
+ w_new, h_new = size
113
+ else:
114
+ raise ValueError(f"Incorrect new size: {size}")
115
+ *c, h, w = image.shape
116
+ if crop_and_center:
117
+ diff = np.array([w - w_new, h - h_new])
118
+ left, top = left_top = np.round(diff / 2).astype(int)
119
+ right, bottom = diff - left_top
120
+ else:
121
+ assert h <= h_new
122
+ assert w <= w_new
123
+ top = bottom = left = right = 0
124
+ slice_out = np.s_[..., : min(h, h_new), : min(w, w_new)]
125
+ slice_in = np.s_[
126
+ ..., max(top, 0) : h - max(bottom, 0), max(left, 0) : w - max(right, 0)
127
+ ]
128
+ if (w, h) == (w_new, h_new):
129
+ out = image
130
+ else:
131
+ out = torch.zeros((*c, h_new, w_new), dtype=image.dtype)
132
+ out[slice_out] = image[slice_in]
133
+ if camera is not None:
134
+ camera = camera.crop((max(left, 0), max(top, 0)), (w_new, h_new))
135
+ out_valid = torch.zeros((h_new, w_new), dtype=torch.bool)
136
+ out_valid[slice_out] = True if valid is None else valid[slice_in]
137
+ if camera is not None:
138
+ return out, out_valid, camera
139
+ else:
140
+ return out, out_valid
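*Illustrative note (not part of the commit):* a minimal sketch of chaining the helpers above on a dummy tensor, mirroring the `pad_to_square` preprocessing path without a `Camera`; the input shape and target size are arbitrary.

```python
# Resize a dummy image so its longest edge is 512 px, then zero-pad it to a
# 512x512 square; pad_image also returns the validity mask of the original pixels.
import torch

from maploc.data.image import pad_image, resize_image

image = torch.rand(3, 480, 640)
image, scale = resize_image(image, 512, fn=max)  # -> (3, 384, 512), scale = (0.8, 0.8)
image, valid = pad_image(image, 512)             # -> (3, 512, 512), valid: (512, 512)
print(image.shape, valid.shape)
```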
maploc/data/kitti/dataset.py ADDED
@@ -0,0 +1,306 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import collections
4
+ import collections.abc
5
+ from collections import defaultdict
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import numpy as np
10
+ import pytorch_lightning as pl
11
+ import torch
12
+ import torch.utils.data as torchdata
13
+ from omegaconf import OmegaConf
14
+ from scipy.spatial.transform import Rotation
15
+
16
+ from ... import logger, DATASETS_PATH
17
+ from ...osm.tiling import TileManager
18
+ from ..dataset import MapLocDataset
19
+ from ..sequential import chunk_sequence
20
+ from ..torch import collate, worker_init_fn
21
+ from .utils import parse_split_file, parse_gps_file, get_camera_calibration
22
+
23
+
24
+ class KittiDataModule(pl.LightningDataModule):
25
+ default_cfg = {
26
+ **MapLocDataset.default_cfg,
27
+ "name": "kitti",
28
+ # paths and fetch
29
+ "data_dir": DATASETS_PATH / "kitti",
30
+ "tiles_filename": "tiles.pkl",
31
+ "splits": {
32
+ "train": "train_files.txt",
33
+ "val": "test1_files.txt",
34
+ "test": "test2_files.txt",
35
+ },
36
+ "loading": {
37
+ "train": "???",
38
+ "val": "${.test}",
39
+ "test": {"batch_size": 1, "num_workers": 0},
40
+ },
41
+ "max_num_val": 500,
42
+ "selection_subset_val": "furthest",
43
+ "drop_train_too_close_to_val": 5.0,
44
+ "skip_frames": 1,
45
+ "camera_index": 2,
46
+ # overwrite
47
+ "crop_size_meters": 64,
48
+ "max_init_error": 20,
49
+ "max_init_error_rotation": 10,
50
+ "add_map_mask": True,
51
+ "mask_pad": 2,
52
+ "target_focal_length": 256,
53
+ }
54
+ dummy_scene_name = "kitti"
55
+
56
+ def __init__(self, cfg, tile_manager: Optional[TileManager] = None):
57
+ super().__init__()
58
+ default_cfg = OmegaConf.create(self.default_cfg)
59
+ OmegaConf.set_struct(default_cfg, True) # cannot add new keys
60
+ self.cfg = OmegaConf.merge(default_cfg, cfg)
61
+ self.root = Path(self.cfg.data_dir)
62
+ self.tile_manager = tile_manager
63
+ if self.cfg.crop_size_meters < self.cfg.max_init_error:
64
+ raise ValueError("The ground truth location can be outside the map.")
65
+ assert self.cfg.selection_subset_val in ["random", "furthest"]
66
+ self.splits = {}
67
+ self.shifts = {}
68
+ self.calibrations = {}
69
+ self.data = {}
70
+ self.image_paths = {}
71
+
72
+ def prepare_data(self):
73
+ if not (self.root.exists() and (self.root / ".downloaded").exists()):
74
+ raise FileNotFoundError(
75
+ "Cannot find the KITTI dataset, run maploc.data.kitti.prepare"
76
+ )
77
+
78
+ def parse_split(self, split_arg):
79
+ if isinstance(split_arg, str):
80
+ names, shifts = parse_split_file(self.root / split_arg)
81
+ elif isinstance(split_arg, collections.abc.Sequence):
82
+ names = []
83
+ shifts = None
84
+ for date_drive in split_arg:
85
+ data_dir = (
86
+ self.root / date_drive / f"image_{self.cfg.camera_index:02}/data"
87
+ )
88
+ assert data_dir.exists(), data_dir
89
+ date_drive = tuple(date_drive.split("/"))
90
+ n = sorted(date_drive + (p.name,) for p in data_dir.glob("*.png"))
91
+ names.extend(n[:: self.cfg.skip_frames])
92
+ else:
93
+ raise ValueError(split_arg)
94
+ return names, shifts
95
+
96
+ def setup(self, stage: Optional[str] = None):
97
+ if stage == "fit":
98
+ stages = ["train", "val"]
99
+ elif stage is None:
100
+ stages = ["train", "val", "test"]
101
+ else:
102
+ stages = [stage]
103
+ for stage in stages:
104
+ self.splits[stage], self.shifts[stage] = self.parse_split(
105
+ self.cfg.splits[stage]
106
+ )
107
+ do_val_subset = "val" in stages and self.cfg.max_num_val is not None
108
+ if do_val_subset and self.cfg.selection_subset_val == "random":
109
+ select = np.random.RandomState(self.cfg.seed).choice(
110
+ len(self.splits["val"]), self.cfg.max_num_val, replace=False
111
+ )
112
+ self.splits["val"] = [self.splits["val"][i] for i in select]
113
+ if self.shifts["val"] is not None:
114
+ self.shifts["val"] = self.shifts["val"][select]
115
+ dates = {d for ns in self.splits.values() for d, _, _ in ns}
116
+ for d in dates:
117
+ self.calibrations[d] = get_camera_calibration(
118
+ self.root / d, self.cfg.camera_index
119
+ )
120
+ if self.tile_manager is None:
121
+ logger.info("Loading the tile manager...")
122
+ self.tile_manager = TileManager.load(self.root / self.cfg.tiles_filename)
123
+ self.cfg.num_classes = {k: len(g) for k, g in self.tile_manager.groups.items()}
124
+ self.cfg.pixel_per_meter = self.tile_manager.ppm
125
+
126
+ # pack all attributes in a single tensor to optimize memory access
127
+ self.pack_data(stages)
128
+
129
+ dists = None
130
+ if do_val_subset and self.cfg.selection_subset_val == "furthest":
131
+ dists = torch.cdist(
132
+ self.data["val"]["t_c2w"][:, :2].double(),
133
+ self.data["train"]["t_c2w"][:, :2].double(),
134
+ )
135
+ min_dists = dists.min(1).values
136
+ select = torch.argsort(min_dists)[-self.cfg.max_num_val :]
137
+ dists = dists[select]
138
+ self.splits["val"] = [self.splits["val"][i] for i in select]
139
+ if self.shifts["val"] is not None:
140
+ self.shifts["val"] = self.shifts["val"][select]
141
+ for k in list(self.data["val"]):
142
+ if k != "cameras":
143
+ self.data["val"][k] = self.data["val"][k][select]
144
+ self.image_paths["val"] = self.image_paths["val"][select]
145
+
146
+ if "train" in stages and self.cfg.drop_train_too_close_to_val is not None:
147
+ if dists is None:
148
+ dists = torch.cdist(
149
+ self.data["val"]["t_c2w"][:, :2].double(),
150
+ self.data["train"]["t_c2w"][:, :2].double(),
151
+ )
152
+ drop = torch.any(dists < self.cfg.drop_train_too_close_to_val, 0)
153
+ select = torch.where(~drop)[0]
154
+ logger.info(
155
+ "Dropping %d (%f %%) images that are too close to validation images.",
156
+ drop.sum(),
157
+ drop.float().mean(),
158
+ )
159
+ self.splits["train"] = [self.splits["train"][i] for i in select]
160
+ if self.shifts["train"] is not None:
161
+ self.shifts["train"] = self.shifts["train"][select]
162
+ for k in list(self.data["train"]):
163
+ if k != "cameras":
164
+ self.data["train"][k] = self.data["train"][k][select]
165
+ self.image_paths["train"] = self.image_paths["train"][select]
166
+
167
+ def pack_data(self, stages):
168
+ for stage in stages:
169
+ names = []
170
+ data = {}
171
+ for i, (date, drive, index) in enumerate(self.splits[stage]):
172
+ d = self.get_frame_data(date, drive, index)
173
+ for k, v in d.items():
174
+ if i == 0:
175
+ data[k] = []
176
+ data[k].append(v)
177
+ path = f"{date}/{drive}/image_{self.cfg.camera_index:02}/data/{index}"
178
+ names.append((self.dummy_scene_name, f"{date}/{drive}", path))
179
+ for k in list(data):
180
+ data[k] = torch.from_numpy(np.stack(data[k]))
181
+ data["camera_id"] = np.full(len(names), self.cfg.camera_index)
182
+
183
+ sequences = {date_drive for _, date_drive, _ in names}
184
+ data["cameras"] = {
185
+ self.dummy_scene_name: {
186
+ seq: {
187
+ self.cfg.camera_index: self.calibrations[seq.split("/")[0]][0]
188
+ }
189
+ for seq in sequences
190
+ }
191
+ }
192
+ shifts = self.shifts[stage]
193
+ if shifts is not None:
194
+ data["shifts"] = torch.from_numpy(shifts.astype(np.float32))
195
+ self.data[stage] = data
196
+ self.image_paths[stage] = np.array(names)
197
+
198
+ def get_frame_data(self, date, drive, index):
199
+ _, R_cam_gps, t_cam_gps = self.calibrations[date]
200
+
201
+ # Transform the GPS pose to the camera pose
202
+ gps_path = (
203
+ self.root / date / drive / "oxts/data" / Path(index).with_suffix(".txt")
204
+ )
205
+ _, R_world_gps, t_world_gps = parse_gps_file(
206
+ gps_path, self.tile_manager.projection
207
+ )
208
+ R_world_cam = R_world_gps @ R_cam_gps.T
209
+ t_world_cam = t_world_gps - R_world_gps @ R_cam_gps.T @ t_cam_gps
210
+ # Some voodoo to extract correct Euler angles from R_world_cam
211
+ R_cv_xyz = Rotation.from_euler("YX", [-90, 90], degrees=True).as_matrix()
212
+ R_world_cam_xyz = R_world_cam @ R_cv_xyz
213
+ y, p, r = Rotation.from_matrix(R_world_cam_xyz).as_euler("ZYX", degrees=True)
214
+ roll, pitch, yaw = r, -p, 90 - y
215
+ roll_pitch_yaw = np.array([-roll, -pitch, yaw], np.float32) # for some reason
216
+
217
+ return {
218
+ "t_c2w": t_world_cam.astype(np.float32),
219
+ "roll_pitch_yaw": roll_pitch_yaw,
220
+ "index": int(index.split(".")[0]),
221
+ }
222
+
223
+ def dataset(self, stage: str):
224
+ return MapLocDataset(
225
+ stage,
226
+ self.cfg,
227
+ self.image_paths[stage],
228
+ self.data[stage],
229
+ {self.dummy_scene_name: self.root},
230
+ {self.dummy_scene_name: self.tile_manager},
231
+ )
232
+
233
+ def dataloader(
234
+ self,
235
+ stage: str,
236
+ shuffle: bool = False,
237
+ num_workers: int = None,
238
+ sampler: Optional[torchdata.Sampler] = None,
239
+ ):
240
+ dataset = self.dataset(stage)
241
+ cfg = self.cfg["loading"][stage]
242
+ num_workers = cfg["num_workers"] if num_workers is None else num_workers
243
+ loader = torchdata.DataLoader(
244
+ dataset,
245
+ batch_size=cfg["batch_size"],
246
+ num_workers=num_workers,
247
+ shuffle=shuffle or (stage == "train"),
248
+ pin_memory=True,
249
+ persistent_workers=num_workers > 0,
250
+ worker_init_fn=worker_init_fn,
251
+ collate_fn=collate,
252
+ sampler=sampler,
253
+ )
254
+ return loader
255
+
256
+ def train_dataloader(self, **kwargs):
257
+ return self.dataloader("train", **kwargs)
258
+
259
+ def val_dataloader(self, **kwargs):
260
+ return self.dataloader("val", **kwargs)
261
+
262
+ def test_dataloader(self, **kwargs):
263
+ return self.dataloader("test", **kwargs)
264
+
265
+ def sequence_dataset(self, stage: str, **kwargs):
266
+ keys = self.image_paths[stage]
267
+ # group images by sequence (date/drive)
268
+ seq2indices = defaultdict(list)
269
+ for index, (_, date_drive, _) in enumerate(keys):
270
+ seq2indices[date_drive].append(index)
271
+ # chunk the sequences to the required length
272
+ chunk2indices = {}
273
+ for seq, indices in seq2indices.items():
274
+ chunks = chunk_sequence(
275
+ self.data[stage], indices, names=self.image_paths[stage], **kwargs
276
+ )
277
+ for i, sub_indices in enumerate(chunks):
278
+ chunk2indices[seq, i] = sub_indices
279
+ # store the index of each chunk in its sequence
280
+ chunk_indices = torch.full((len(keys),), -1)
281
+ for (_, chunk_index), idx in chunk2indices.items():
282
+ chunk_indices[idx] = chunk_index
283
+ self.data[stage]["chunk_index"] = chunk_indices
284
+ dataset = self.dataset(stage)
285
+ return dataset, chunk2indices
286
+
287
+ def sequence_dataloader(self, stage: str, shuffle: bool = False, **kwargs):
288
+ dataset, chunk2idx = self.sequence_dataset(stage, **kwargs)
289
+ seq_keys = sorted(chunk2idx)
290
+ if shuffle:
291
+ perm = torch.randperm(len(seq_keys))
292
+ seq_keys = [seq_keys[i] for i in perm]
293
+ key_indices = [i for key in seq_keys for i in chunk2idx[key]]
294
+ num_workers = self.cfg["loading"][stage]["num_workers"]
295
+ loader = torchdata.DataLoader(
296
+ dataset,
297
+ batch_size=None,
298
+ sampler=key_indices,
299
+ num_workers=num_workers,
300
+ shuffle=False,
301
+ pin_memory=True,
302
+ persistent_workers=num_workers > 0,
303
+ worker_init_fn=worker_init_fn,
304
+ collate_fn=collate,
305
+ )
306
+ return loader, seq_keys, chunk2idx
maploc/data/kitti/prepare.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ import shutil
6
+ import zipfile
7
+
8
+ import numpy as np
9
+ from tqdm.auto import tqdm
10
+
11
+ from ... import logger
12
+ from ...osm.tiling import TileManager
13
+ from ...osm.viz import GeoPlotter
14
+ from ...utils.geo import BoundaryBox, Projection
15
+ from ...utils.io import download_file, DATA_URL
16
+ from .utils import parse_gps_file
17
+ from .dataset import KittiDataModule
18
+
19
+ split_files = ["test1_files.txt", "test2_files.txt", "train_files.txt"]
20
+
21
+
22
+ def prepare_osm(
23
+ data_dir,
24
+ osm_path,
25
+ output_path,
26
+ tile_margin=512,
27
+ ppm=2,
28
+ ):
29
+ all_latlon = []
30
+ for gps_path in data_dir.glob("2011_*/*/oxts/data/*.txt"):
31
+ all_latlon.append(parse_gps_file(gps_path)[0])
32
+ if not all_latlon:
33
+ raise ValueError(f"Cannot find any GPS file in {data_dir}.")
34
+ all_latlon = np.stack(all_latlon)
35
+ projection = Projection.from_points(all_latlon)
36
+ all_xy = projection.project(all_latlon)
37
+ bbox_map = BoundaryBox(all_xy.min(0), all_xy.max(0)) + tile_margin
38
+
39
+ plotter = GeoPlotter()
40
+ plotter.points(all_latlon, "red", name="GPS")
41
+ plotter.bbox(projection.unproject(bbox_map), "blue", "tiling bounding box")
42
+ plotter.fig.write_html(data_dir / "split_kitti.html")
43
+
44
+ tile_manager = TileManager.from_bbox(
45
+ projection,
46
+ bbox_map,
47
+ ppm,
48
+ path=osm_path,
49
+ )
50
+ tile_manager.save(output_path)
51
+ return tile_manager
52
+
53
+
54
+ def download(data_dir: Path):
55
+ data_dir.mkdir(exist_ok=True, parents=True)
56
+ this_dir = Path(__file__).parent
57
+
58
+ seqs = set()
59
+ for f in split_files:
60
+ shutil.copy(this_dir / f, data_dir)
61
+ with open(this_dir / f, "r") as fid:
62
+ info = fid.read()
63
+ for line in info.split("\n"):
64
+ if line:
65
+ seq = line.split()[0].split("/")[1][: -len("_sync")]
66
+ seqs.add(seq)
67
+ dates = {"_".join(s.split("_")[:3]) for s in seqs}
68
+ logger.info("Downloading data for %d sequences in %d dates", len(seqs), len(dates))
69
+
70
+ for seq in tqdm(seqs):
71
+ logger.info("Working on %s.", seq)
72
+ date = "_".join(seq.split("_")[:3])
73
+ url = f"https://s3.eu-central-1.amazonaws.com/avg-kitti/raw_data/{seq}/{seq}_sync.zip"
74
+ seq_dir = data_dir / date / f"{seq}_sync"
75
+ if seq_dir.exists():
76
+ continue
77
+ zip_path = download_file(url, data_dir)
78
+ with zipfile.ZipFile(zip_path, "r") as z:
79
+ z.extractall(data_dir)
80
+ # Delete unused files to save space.
81
+ for image_index in [0, 1, 3]:
82
+ shutil.rmtree(seq_dir / f"image_0{image_index}")
83
+ shutil.rmtree(seq_dir / "velodyne_points")
84
+ Path(zip_path).unlink()
86
+
87
+ for date in tqdm(dates):
88
+ url = (
89
+ f"https://s3.eu-central-1.amazonaws.com/avg-kitti/raw_data/{date}_calib.zip"
90
+ )
91
+ zip_path = download_file(url, data_dir)
92
+ with zipfile.ZipFile(zip_path, "r") as z:
93
+ z.extractall(data_dir)
94
+ Path(zip_path).unlink()
95
+
96
+
97
+ if __name__ == "__main__":
98
+ parser = argparse.ArgumentParser()
99
+ parser.add_argument(
100
+ "--data_dir", type=Path, default=Path(KittiDataModule.default_cfg["local_dir"])
101
+ )
102
+ parser.add_argument("--pixel_per_meter", type=int, default=2)
103
+ parser.add_argument("--generate_tiles", action="store_true")
104
+ args = parser.parse_args()
105
+
106
+ args.data_dir.mkdir(exist_ok=True, parents=True)
107
+ download(args.data_dir)
108
+
109
+ tiles_path = args.data_dir / KittiDataModule.default_cfg["tiles_filename"]
110
+ if args.generate_tiles:
111
+ logger.info("Generating the map tiles.")
112
+ osm_filename = "karlsruhe.osm"
113
+ osm_path = args.data_dir / osm_filename
114
+ if not osm_path.exists():
115
+ logger.info("Downloading OSM raw data.")
116
+ download_file(DATA_URL + f"/osm/{osm_filename}", osm_path)
117
+ if not osm_path.exists():
118
+ raise FileNotFoundError(f"No OSM data file at {osm_path}.")
119
+ prepare_osm(args.data_dir, osm_path, tiles_path, ppm=args.pixel_per_meter)
120
+ (args.data_dir / ".downloaded").touch()
121
+ else:
122
+ logger.info("Downloading pre-generated map tiles.")
123
+ download_file(DATA_URL + "/tiles/kitti.pkl", tiles_path)
maploc/data/kitti/test1_files.txt ADDED
The diff for this file is too large to render. See raw diff
 
maploc/data/kitti/test2_files.txt ADDED
The diff for this file is too large to render. See raw diff
 
maploc/data/kitti/train_files.txt ADDED
The diff for this file is too large to render. See raw diff
 
maploc/data/kitti/utils.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from pathlib import Path
4
+
5
+ import numpy as np
6
+ from scipy.spatial.transform import Rotation
7
+
8
+ from ...utils.geo import Projection
9
+
10
+ split_files = ["test1_files.txt", "test2_files.txt", "train_files.txt"]
11
+
12
+
13
+ def parse_gps_file(path, projection: Projection = None):
14
+ with open(path, "r") as fid:
15
+ lat, lon, _, roll, pitch, yaw, *_ = map(float, fid.read().split())
16
+ latlon = np.array([lat, lon])
17
+ R_world_gps = Rotation.from_euler("ZYX", [yaw, pitch, roll]).as_matrix()
18
+ t_world_gps = None if projection is None else np.r_[projection.project(latlon), 0]
19
+ return latlon, R_world_gps, t_world_gps
20
+
21
+
22
+ def parse_split_file(path: Path):
23
+ with open(path, "r") as fid:
24
+ info = fid.read()
25
+ names = []
26
+ shifts = []
27
+ for line in info.split("\n"):
28
+ if not line:
29
+ continue
30
+ name, *shift = line.split()
31
+ names.append(tuple(name.split("/")))
32
+ if len(shift) > 0:
33
+ assert len(shift) == 3
34
+ shifts.append(np.array(shift, float))
35
+ shifts = None if len(shifts) == 0 else np.stack(shifts)
36
+ return names, shifts
37
+
38
+
39
+ def parse_calibration_file(path):
40
+ calib = {}
41
+ with open(path, "r") as fid:
42
+ for line in fid.read().split("\n"):
43
+ if not line:
44
+ continue
45
+ key, *data = line.split(" ")
46
+ key = key.rstrip(":")
47
+ if key.startswith("R"):
48
+ data = np.array(data, float).reshape(3, 3)
49
+ elif key.startswith("T"):
50
+ data = np.array(data, float).reshape(3)
51
+ elif key.startswith("P"):
52
+ data = np.array(data, float).reshape(3, 4)
53
+ calib[key] = data
54
+ return calib
55
+
56
+
57
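+ # Compose the GPS/IMU-to-camera transform for camera cam_index by chaining the
+ # imu-to-velo and velo-to-cam0 calibrations with the rectification and baseline.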
+ def get_camera_calibration(calib_dir, cam_index: int):
58
+ calib_path = calib_dir / "calib_cam_to_cam.txt"
59
+ calib_cam = parse_calibration_file(calib_path)
60
+ P = calib_cam[f"P_rect_{cam_index:02}"]
61
+ K = P[:3, :3]
62
+ size = np.array(calib_cam[f"S_rect_{cam_index:02}"], float).astype(int)
63
+ camera = {
64
+ "model": "PINHOLE",
65
+ "width": size[0],
66
+ "height": size[1],
67
+ "params": K[[0, 1, 0, 1], [0, 1, 2, 2]],
68
+ }
69
+
70
+ t_cam_cam0 = P[:3, 3] / K[[0, 1, 2], [0, 1, 2]]
71
+ R_rect_cam0 = calib_cam["R_rect_00"]
72
+
73
+ calib_gps_velo = parse_calibration_file(calib_dir / "calib_imu_to_velo.txt")
74
+ calib_velo_cam0 = parse_calibration_file(calib_dir / "calib_velo_to_cam.txt")
75
+ R_cam0_gps = calib_velo_cam0["R"] @ calib_gps_velo["R"]
76
+ t_cam0_gps = calib_velo_cam0["R"] @ calib_gps_velo["T"] + calib_velo_cam0["T"]
77
+ R_cam_gps = R_rect_cam0 @ R_cam0_gps
78
+ t_cam_gps = t_cam_cam0 + R_rect_cam0 @ t_cam0_gps
79
+ return camera, R_cam_gps, t_cam_gps
maploc/data/mapillary/dataset.py ADDED
@@ -0,0 +1,350 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import json
4
+ from collections import defaultdict
5
+ import os
6
+ import shutil
7
+ import tarfile
8
+ from pathlib import Path
9
+ from typing import Any, Dict, Optional
10
+
11
+ import numpy as np
12
+ import pytorch_lightning as pl
13
+ import torch
14
+ import torch.utils.data as torchdata
15
+ from omegaconf import DictConfig, OmegaConf
16
+
17
+ from ... import logger, DATASETS_PATH
18
+ from ...osm.tiling import TileManager
19
+ from ..dataset import MapLocDataset
20
+ from ..sequential import chunk_sequence
21
+ from ..torch import collate, worker_init_fn
22
+
23
+
24
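+ # Convert the lists loaded from JSON into numpy arrays and drop per-view chunk ids.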
+ def pack_dump_dict(dump):
25
+ for per_seq in dump.values():
26
+ if "points" in per_seq:
27
+ for chunk in list(per_seq["points"]):
28
+ points = per_seq["points"].pop(chunk)
29
+ if points is not None:
30
+ per_seq["points"][chunk] = np.array(
31
+ points, np.float64
32
+ )
33
+ for view in per_seq["views"].values():
34
+ for k in ["R_c2w", "roll_pitch_yaw"]:
35
+ view[k] = np.array(view[k], np.float32)
36
+ for k in ["chunk_id"]:
37
+ if k in view:
38
+ view.pop(k)
39
+ if "observations" in view:
40
+ view["observations"] = np.array(view["observations"])
41
+ for camera in per_seq["cameras"].values():
42
+ for k in ["params"]:
43
+ camera[k] = np.array(camera[k], np.float32)
44
+ return dump
45
+
46
+
47
+ class MapillaryDataModule(pl.LightningDataModule):
48
+ dump_filename = "dump.json"
49
+ images_archive = "images.tar.gz"
50
+ images_dirname = "images/"
51
+
52
+ default_cfg = {
53
+ **MapLocDataset.default_cfg,
54
+ "name": "mapillary",
55
+ # paths and fetch
56
+ "data_dir": DATASETS_PATH / "MGL",
57
+ "local_dir": None,
58
+ "tiles_filename": "tiles.pkl",
59
+ "scenes": "???",
60
+ "split": None,
61
+ "loading": {
62
+ "train": "???",
63
+ "val": "${.test}",
64
+ "test": {"batch_size": 1, "num_workers": 0},
65
+ },
66
+ "filter_for": None,
67
+ "filter_by_ground_angle": None,
68
+ "min_num_points": "???",
69
+ }
70
+
71
+ def __init__(self, cfg: Dict[str, Any]):
72
+ super().__init__()
73
+ default_cfg = OmegaConf.create(self.default_cfg)
74
+ OmegaConf.set_struct(default_cfg, True) # cannot add new keys
75
+ self.cfg = OmegaConf.merge(default_cfg, cfg)
76
+ self.root = Path(self.cfg.data_dir)
77
+ self.local_dir = self.cfg.local_dir or os.environ.get("TMPDIR")
78
+ if self.local_dir is not None:
79
+ self.local_dir = Path(self.local_dir, "MGL")
80
+ if self.cfg.crop_size_meters < self.cfg.max_init_error:
81
+ raise ValueError("The ground truth location can be outside the map.")
82
+
83
+ def prepare_data(self):
84
+ for scene in self.cfg.scenes:
85
+ dump_dir = self.root / scene
86
+ assert (dump_dir / self.dump_filename).exists(), dump_dir
87
+ assert (dump_dir / self.cfg.tiles_filename).exists(), dump_dir
88
+ if self.local_dir is None:
89
+ assert (dump_dir / self.images_dirname).exists(), dump_dir
90
+ continue
91
+ # Cache the folder of images locally to speed up reading
92
+ local_dir = self.local_dir / scene
93
+ if local_dir.exists():
94
+ shutil.rmtree(local_dir)
95
+ local_dir.mkdir(exist_ok=True, parents=True)
96
+ images_archive = dump_dir / self.images_archive
97
+ logger.info("Extracting the image archive %s.", images_archive)
98
+ with tarfile.open(images_archive) as fp:
99
+ fp.extractall(local_dir)
100
+
101
+ def setup(self, stage: Optional[str] = None):
102
+ self.dumps = {}
103
+ self.tile_managers = {}
104
+ self.image_dirs = {}
105
+ names = []
106
+
107
+ for scene in self.cfg.scenes:
108
+ logger.info("Loading scene %s.", scene)
109
+ dump_dir = self.root / scene
110
+ logger.info("Loading map tiles %s.", self.cfg.tiles_filename)
111
+ self.tile_managers[scene] = TileManager.load(
112
+ dump_dir / self.cfg.tiles_filename
113
+ )
114
+ groups = self.tile_managers[scene].groups
115
+ if self.cfg.num_classes: # check consistency
116
+ if set(groups.keys()) != set(self.cfg.num_classes.keys()):
117
+ raise ValueError(
118
+ f"Inconsistent groups: {groups.keys()} {self.cfg.num_classes.keys()}"
119
+ )
120
+ for k in groups:
121
+ if len(groups[k]) != self.cfg.num_classes[k]:
122
+ raise ValueError(
123
+ f"{k}: {len(groups[k])} vs {self.cfg.num_classes[k]}"
124
+ )
125
+ ppm = self.tile_managers[scene].ppm
126
+ if ppm != self.cfg.pixel_per_meter:
127
+ raise ValueError(
128
+ "The tile manager and the config/model have different ground resolutions: "
129
+ f"{ppm} vs {self.cfg.pixel_per_meter}"
130
+ )
131
+
132
+ logger.info("Loading dump json file %s.", self.dump_filename)
133
+ with (dump_dir / self.dump_filename).open("r") as fp:
134
+ self.dumps[scene] = pack_dump_dict(json.load(fp))
135
+ for seq, per_seq in self.dumps[scene].items():
136
+ for cam_id, cam_dict in per_seq["cameras"].items():
137
+ if cam_dict["model"] != "PINHOLE":
138
+ raise ValueError(
139
+ f"Unsupported camera model: {cam_dict['model']} for {scene},{seq},{cam_id}"
140
+ )
141
+
142
+ self.image_dirs[scene] = (
143
+ (self.local_dir or self.root) / scene / self.images_dirname
144
+ )
145
+ assert self.image_dirs[scene].exists(), self.image_dirs[scene]
146
+
147
+ for seq, data in self.dumps[scene].items():
148
+ for name in data["views"]:
149
+ names.append((scene, seq, name))
150
+
151
+ self.parse_splits(self.cfg.split, names)
152
+ if self.cfg.filter_for is not None:
153
+ self.filter_elements()
154
+ self.pack_data()
155
+
156
+ def pack_data(self):
157
+ # We pack the data into compact tensors that can be shared across processes without copy
158
+ exclude = {
159
+ "compass_angle",
160
+ "compass_accuracy",
161
+ "gps_accuracy",
162
+ "chunk_key",
163
+ "panorama_offset",
164
+ }
165
+ cameras = {
166
+ scene: {seq: per_seq["cameras"] for seq, per_seq in per_scene.items()}
167
+ for scene, per_scene in self.dumps.items()
168
+ }
169
+ points = {
170
+ scene: {
171
+ seq: {
172
+ i: torch.from_numpy(p) for i, p in per_seq.get("points", {}).items()
173
+ }
174
+ for seq, per_seq in per_scene.items()
175
+ }
176
+ for scene, per_scene in self.dumps.items()
177
+ }
178
+ self.data = {}
179
+ for stage, names in self.splits.items():
180
+ view = self.dumps[names[0][0]][names[0][1]]["views"][names[0][2]]
181
+ data = {k: [] for k in view.keys() - exclude}
182
+ for scene, seq, name in names:
183
+ for k in data:
184
+ data[k].append(self.dumps[scene][seq]["views"][name].get(k, None))
185
+ for k in data:
186
+ v = np.array(data[k])
187
+ if np.issubdtype(v.dtype, np.integer) or np.issubdtype(
188
+ v.dtype, np.floating
189
+ ):
190
+ v = torch.from_numpy(v)
191
+ data[k] = v
192
+ data["cameras"] = cameras
193
+ data["points"] = points
194
+ self.data[stage] = data
195
+ self.splits[stage] = np.array(names)
196
+
197
+ def filter_elements(self):
198
+ for stage, names in self.splits.items():
199
+ names_select = []
200
+ for scene, seq, name in names:
201
+ view = self.dumps[scene][seq]["views"][name]
202
+ if self.cfg.filter_for == "ground_plane":
203
+ if not (1.0 <= view["height"] <= 3.0):
204
+ continue
205
+ planes = self.dumps[scene][seq].get("plane")
206
+ if planes is not None:
207
+ inliers = planes[str(view["chunk_id"])][-1]
208
+ if inliers < 10:
209
+ continue
210
+ if self.cfg.filter_by_ground_angle is not None:
211
+ plane = np.array(view["plane_params"])
212
+ normal = plane[:3] / np.linalg.norm(plane[:3])
213
+ angle = np.rad2deg(np.arccos(np.abs(normal[-1])))
214
+ if angle > self.cfg.filter_by_ground_angle:
215
+ continue
216
+ elif self.cfg.filter_for == "pointcloud":
217
+ if len(view["observations"]) < self.cfg.min_num_points:
218
+ continue
219
+ elif self.cfg.filter_for is not None:
220
+ raise ValueError(f"Unknown filtering: {self.cfg.filter_for}")
221
+ names_select.append((scene, seq, name))
222
+ logger.info(
223
+ "%s: Keep %d/%d images after filtering for %s.",
224
+ stage,
225
+ len(names_select),
226
+ len(names),
227
+ self.cfg.filter_for,
228
+ )
229
+ self.splits[stage] = names_select
230
+
231
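+ # The split can be None (train = val = all images), an integer (size of a random
+ # validation hold-out), per-stage lists of scenes, or the name of a JSON file
+ # listing the image ids of each location per stage.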
+ def parse_splits(self, split_arg, names):
232
+ if split_arg is None:
233
+ self.splits = {
234
+ "train": names,
235
+ "val": names,
236
+ }
237
+ elif isinstance(split_arg, int):
238
+ names = np.random.RandomState(self.cfg.seed).permutation(names).tolist()
239
+ self.splits = {
240
+ "train": names[split_arg:],
241
+ "val": names[:split_arg],
242
+ }
243
+ elif isinstance(split_arg, DictConfig):
244
+ scenes_val = set(split_arg.val)
245
+ scenes_train = set(split_arg.train)
246
+ assert len(scenes_val - set(self.cfg.scenes)) == 0
247
+ assert len(scenes_train - set(self.cfg.scenes)) == 0
248
+ self.splits = {
249
+ "train": [n for n in names if n[0] in scenes_train],
250
+ "val": [n for n in names if n[0] in scenes_val],
251
+ }
252
+ elif isinstance(split_arg, str):
253
+ with (self.root / split_arg).open("r") as fp:
254
+ splits = json.load(fp)
255
+ splits = {
256
+ k: {loc: set(ids) for loc, ids in split.items()}
257
+ for k, split in splits.items()
258
+ }
259
+ self.splits = {}
260
+ for k, split in splits.items():
261
+ self.splits[k] = [
262
+ n
263
+ for n in names
264
+ if n[0] in split and int(n[-1].rsplit("_", 1)[0]) in split[n[0]]
265
+ ]
266
+ else:
267
+ raise ValueError(split_arg)
268
+
269
+ def dataset(self, stage: str):
270
+ return MapLocDataset(
271
+ stage,
272
+ self.cfg,
273
+ self.splits[stage],
274
+ self.data[stage],
275
+ self.image_dirs,
276
+ self.tile_managers,
277
+ image_ext=".jpg",
278
+ )
279
+
280
+ def dataloader(
281
+ self,
282
+ stage: str,
283
+ shuffle: bool = False,
284
+ num_workers: int = None,
285
+ sampler: Optional[torchdata.Sampler] = None,
286
+ ):
287
+ dataset = self.dataset(stage)
288
+ cfg = self.cfg["loading"][stage]
289
+ num_workers = cfg["num_workers"] if num_workers is None else num_workers
290
+ loader = torchdata.DataLoader(
291
+ dataset,
292
+ batch_size=cfg["batch_size"],
293
+ num_workers=num_workers,
294
+ shuffle=shuffle or (stage == "train"),
295
+ pin_memory=True,
296
+ persistent_workers=num_workers > 0,
297
+ worker_init_fn=worker_init_fn,
298
+ collate_fn=collate,
299
+ sampler=sampler,
300
+ )
301
+ return loader
302
+
303
+ def train_dataloader(self, **kwargs):
304
+ return self.dataloader("train", **kwargs)
305
+
306
+ def val_dataloader(self, **kwargs):
307
+ return self.dataloader("val", **kwargs)
308
+
309
+ def test_dataloader(self, **kwargs):
310
+ return self.dataloader("test", **kwargs)
311
+
312
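+ # Group the views of each sequence into chunks (see chunk_sequence) and store the
+ # index of each view's chunk in the packed data.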
+ def sequence_dataset(self, stage: str, **kwargs):
313
+ keys = self.splits[stage]
314
+ seq2indices = defaultdict(list)
315
+ for index, (_, seq, _) in enumerate(keys):
316
+ seq2indices[seq].append(index)
317
+ # chunk the sequences to the required length
318
+ chunk2indices = {}
319
+ for seq, indices in seq2indices.items():
320
+ chunks = chunk_sequence(self.data[stage], indices, **kwargs)
321
+ for i, sub_indices in enumerate(chunks):
322
+ chunk2indices[seq, i] = sub_indices
323
+ # store the index of each chunk in its sequence
324
+ chunk_indices = torch.full((len(keys),), -1)
325
+ for (_, chunk_index), idx in chunk2indices.items():
326
+ chunk_indices[idx] = chunk_index
327
+ self.data[stage]["chunk_index"] = chunk_indices
328
+ dataset = self.dataset(stage)
329
+ return dataset, chunk2indices
330
+
331
+ def sequence_dataloader(self, stage: str, shuffle: bool = False, **kwargs):
332
+ dataset, chunk2idx = self.sequence_dataset(stage, **kwargs)
333
+ chunk_keys = sorted(chunk2idx)
334
+ if shuffle:
335
+ perm = torch.randperm(len(chunk_keys))
336
+ chunk_keys = [chunk_keys[i] for i in perm]
337
+ key_indices = [i for key in chunk_keys for i in chunk2idx[key]]
338
+ num_workers = self.cfg["loading"][stage]["num_workers"]
339
+ loader = torchdata.DataLoader(
340
+ dataset,
341
+ batch_size=None,
342
+ sampler=key_indices,
343
+ num_workers=num_workers,
344
+ shuffle=False,
345
+ pin_memory=True,
346
+ persistent_workers=num_workers > 0,
347
+ worker_init_fn=worker_init_fn,
348
+ collate_fn=collate,
349
+ )
350
+ return loader, chunk_keys, chunk2idx
maploc/data/mapillary/download.py ADDED
@@ -0,0 +1,180 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import json
4
+ from pathlib import Path
5
+
6
+ import numpy as np
7
+ import httpx
8
+ import asyncio
9
+ from aiolimiter import AsyncLimiter
10
+ import tqdm
11
+
12
+ from opensfm.pygeometry import Camera, Pose
13
+ from opensfm.pymap import Shot
14
+
15
+ from ... import logger
16
+ from ...utils.geo import Projection
17
+
18
+
19
+ semaphore = asyncio.Semaphore(100)  # maximum number of concurrent requests.
20
+ image_filename = "{image_id}.jpg"
21
+ info_filename = "{image_id}.json"
22
+
23
+
24
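+ # Minimal asynchronous client for the Mapillary Graph API, rate-limited to half of
+ # the allowed number of requests per minute.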
+ class MapillaryDownloader:
25
+ image_fields = (
26
+ "id",
27
+ "height",
28
+ "width",
29
+ "camera_parameters",
30
+ "camera_type",
31
+ "captured_at",
32
+ "compass_angle",
33
+ "geometry",
34
+ "altitude",
35
+ "computed_compass_angle",
36
+ "computed_geometry",
37
+ "computed_altitude",
38
+ "computed_rotation",
39
+ "thumb_2048_url",
40
+ "thumb_original_url",
41
+ "sequence",
42
+ "sfm_cluster",
43
+ )
44
+ image_info_url = (
45
+ "https://graph.mapillary.com/{image_id}?access_token={token}&fields={fields}"
46
+ )
47
+ seq_info_url = "https://graph.mapillary.com/image_ids?access_token={token}&sequence_id={seq_id}"
48
+ max_requests_per_minute = 50_000
49
+
50
+ def __init__(self, token: str):
51
+ self.token = token
52
+ self.client = httpx.AsyncClient(
53
+ transport=httpx.AsyncHTTPTransport(retries=20), timeout=20.0
54
+ )
55
+ self.limiter = AsyncLimiter(self.max_requests_per_minute // 2, time_period=60)
56
+
57
+ async def call_api(self, url: str):
58
+ async with self.limiter:
59
+ r = await self.client.get(url)
60
+ if not r.is_success:
61
+ logger.error("Error in API call: %s", r.text)
62
+ return r
63
+
64
+ async def get_image_info(self, image_id: int):
65
+ url = self.image_info_url.format(
66
+ image_id=image_id,
67
+ token=self.token,
68
+ fields=",".join(self.image_fields),
69
+ )
70
+ r = await self.call_api(url)
71
+ if r.is_success:
72
+ return json.loads(r.text)
73
+
74
+ async def get_sequence_info(self, seq_id: str):
75
+ url = self.seq_info_url.format(seq_id=seq_id, token=self.token)
76
+ r = await self.call_api(url)
77
+ if r.is_success:
78
+ return json.loads(r.text)
79
+
80
+ async def download_image_pixels(self, url: str, path: Path):
81
+ r = await self.call_api(url)
82
+ if r.is_success:
83
+ with open(path, "wb") as fid:
84
+ fid.write(r.content)
85
+ return r.is_success
86
+
87
+ async def get_image_info_cached(self, image_id: int, path: Path):
88
+ if path.exists():
89
+ info = json.loads(path.read_text())
90
+ else:
91
+ info = await self.get_image_info(image_id)
92
+ path.write_text(json.dumps(info))
93
+ return info
94
+
95
+ async def download_image_pixels_cached(self, url: str, path: Path):
96
+ if path.exists():
97
+ return True
98
+ else:
99
+ return await self.download_image_pixels(url, path)
100
+
101
+
102
+ async def fetch_images_in_sequence(i, downloader):
103
+ async with semaphore:
104
+ info = await downloader.get_sequence_info(i)
105
+ image_ids = [int(d["id"]) for d in info["data"]]
106
+ return i, image_ids
107
+
108
+
109
+ async def fetch_images_in_sequences(sequence_ids, downloader):
110
+ seq_to_images_ids = {}
111
+ tasks = [fetch_images_in_sequence(i, downloader) for i in sequence_ids]
112
+ for task in tqdm.asyncio.tqdm.as_completed(tasks):
113
+ i, image_ids = await task
114
+ seq_to_images_ids[i] = image_ids
115
+ return seq_to_images_ids
116
+
117
+
118
+ async def fetch_image_info(i, downloader, dir_):
119
+ async with semaphore:
120
+ path = dir_ / info_filename.format(image_id=i)
121
+ info = await downloader.get_image_info_cached(i, path)
122
+ return i, info
123
+
124
+
125
+ async def fetch_image_infos(image_ids, downloader, dir_):
126
+ infos = {}
127
+ num_fail = 0
128
+ tasks = [fetch_image_info(i, downloader, dir_) for i in image_ids]
129
+ for task in tqdm.asyncio.tqdm.as_completed(tasks):
130
+ i, info = await task
131
+ if info is None:
132
+ num_fail += 1
133
+ else:
134
+ infos[i] = info
135
+ return infos, num_fail
136
+
137
+
138
+ async def fetch_image_pixels(i, url, downloader, dir_, overwrite=False):
139
+ async with semaphore:
140
+ path = dir_ / image_filename.format(image_id=i)
141
+ if overwrite:
142
+ path.unlink(missing_ok=True)
143
+ success = await downloader.download_image_pixels_cached(url, path)
144
+ return i, success
145
+
146
+
147
+ async def fetch_images_pixels(image_urls, downloader, dir_):
148
+ num_fail = 0
149
+ tasks = [fetch_image_pixels(*id_url, downloader, dir_) for id_url in image_urls]
150
+ for task in tqdm.asyncio.tqdm.as_completed(tasks):
151
+ i, success = await task
152
+ num_fail += not success
153
+ return num_fail
154
+
155
+
156
+ def opensfm_camera_from_info(info: dict) -> Camera:
157
+ cam_type = info["camera_type"]
158
+ if cam_type == "perspective":
159
+ camera = Camera.create_perspective(*info["camera_parameters"])
160
+ elif cam_type == "fisheye":
161
+ camera = Camera.create_fisheye(*info["camera_parameters"])
162
+ elif Camera.is_panorama(cam_type):
163
+ camera = Camera.create_spherical()
164
+ else:
165
+ raise ValueError(cam_type)
166
+ camera.width = info["width"]
167
+ camera.height = info["height"]
168
+ camera.id = info["id"]
169
+ return camera
170
+
171
+
172
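+ # Build an OpenSfM camera and shot from the Mapillary metadata, with the computed
+ # geolocation projected into the local XYZ frame.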
+ def opensfm_shot_from_info(info: dict, projection: Projection) -> Shot:
173
+ latlong = info["computed_geometry"]["coordinates"][::-1]
174
+ alt = info["computed_altitude"]
175
+ xyz = projection.project(np.array([*latlong, alt]), return_z=True)
176
+ c_rotvec_w = np.array(info["computed_rotation"])
177
+ pose = Pose()
178
+ pose.set_from_cam_to_world(-c_rotvec_w, xyz)
179
+ camera = opensfm_camera_from_info(info)
180
+ return latlong, Shot(info["id"], camera, pose)
maploc/data/mapillary/prepare.py ADDED
@@ -0,0 +1,406 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import asyncio
4
+ import argparse
5
+ from collections import defaultdict
6
+ import json
7
+ import shutil
8
+ from pathlib import Path
9
+ from typing import List
10
+
11
+ import numpy as np
12
+ import cv2
13
+ from tqdm import tqdm
14
+ from tqdm.contrib.concurrent import thread_map
15
+ from omegaconf import DictConfig, OmegaConf
16
+ from opensfm.pygeometry import Camera
17
+ from opensfm.pymap import Shot
18
+ from opensfm.undistort import (
19
+ perspective_camera_from_fisheye,
20
+ perspective_camera_from_perspective,
21
+ )
22
+
23
+ from ... import logger
24
+ from ...osm.tiling import TileManager
25
+ from ...osm.viz import GeoPlotter
26
+ from ...utils.geo import BoundaryBox, Projection
27
+ from ...utils.io import write_json, download_file, DATA_URL
28
+ from ..utils import decompose_rotmat
29
+ from .utils import (
30
+ keyframe_selection,
31
+ perspective_camera_from_pano,
32
+ scale_camera,
33
+ CameraUndistorter,
34
+ PanoramaUndistorter,
35
+ undistort_shot,
36
+ )
37
+ from .download import (
38
+ MapillaryDownloader,
39
+ opensfm_shot_from_info,
40
+ image_filename,
41
+ fetch_image_infos,
42
+ fetch_images_pixels,
43
+ )
44
+ from .dataset import MapillaryDataModule
45
+
46
+
47
+ location_to_params = {
48
+ "sanfrancisco_soma": {
49
+ "bbox": BoundaryBox(
50
+ [-122.410307, 37.770364][::-1], [-122.388772, 37.795545][::-1]
51
+ ),
52
+ "camera_models": ["GoPro Max"],
53
+ "osm_file": "sanfrancisco.osm",
54
+ },
55
+ "sanfrancisco_hayes": {
56
+ "bbox": BoundaryBox(
57
+ [-122.438415, 37.768634][::-1], [-122.410605, 37.783894][::-1]
58
+ ),
59
+ "camera_models": ["GoPro Max"],
60
+ "osm_file": "sanfrancisco.osm",
61
+ },
62
+ "amsterdam": {
63
+ "bbox": BoundaryBox([4.845284, 52.340679][::-1], [4.926147, 52.386299][::-1]),
64
+ "camera_models": ["GoPro Max"],
65
+ "osm_file": "amsterdam.osm",
66
+ },
67
+ "lemans": {
68
+ "bbox": BoundaryBox([0.185752, 47.995125][::-1], [0.224088, 48.014209][::-1]),
69
+ "owners": ["xXOocM1jUB4jaaeukKkmgw"], # sogefi
70
+ "osm_file": "lemans.osm",
71
+ },
72
+ "berlin": {
73
+ "bbox": BoundaryBox([13.416271, 52.459656][::-1], [13.469829, 52.499195][::-1]),
74
+ "owners": ["LT3ajUxH6qsosamrOHIrFw"], # supaplex030
75
+ "osm_file": "berlin.osm",
76
+ },
77
+ "montrouge": {
78
+ "bbox": BoundaryBox([2.298958, 48.80874][::-1], [2.332989, 48.825276][::-1]),
79
+ "owners": [
80
+ "XtzGKZX2_VIJRoiJ8IWRNQ",
81
+ "C4ENdWpJdFNf8CvnQd7NrQ",
82
+ "e_ZBE6mFd7CYNjRSpLl-Lg",
83
+ ], # overflorian, phyks, francois2
84
+ "camera_models": ["LG-R105"],
85
+ "osm_file": "paris.osm",
86
+ },
87
+ "nantes": {
88
+ "bbox": BoundaryBox([-1.585839, 47.198289][::-1], [-1.51318, 47.236161][::-1]),
89
+ "owners": [
90
+ "jGdq3CL-9N-Esvj3mtCWew",
91
+ "s-j5BH9JRIzsgORgaJF3aA",
92
+ ], # c_mobilite, cartocite
93
+ "osm_file": "nantes.osm",
94
+ },
95
+ "toulouse": {
96
+ "bbox": BoundaryBox([1.429457, 43.591434][::-1], [1.456653, 43.61343][::-1]),
97
+ "owners": ["MNkhq6MCoPsdQNGTMh3qsQ"], # tyndare
98
+ "osm_file": "toulouse.osm",
99
+ },
100
+ "vilnius": {
101
+ "bbox": BoundaryBox([25.258633, 54.672956][::-1], [25.296094, 54.696755][::-1]),
102
+ "owners": ["bClduFF6Gq16cfwCdhWivw", "u5ukBseATUS8jUbtE43fcO"], # kedas, vms
103
+ "osm_file": "vilnius.osm",
104
+ },
105
+ "helsinki": {
106
+ "bbox": BoundaryBox(
107
+ [24.8975480117, 60.1449128318][::-1], [24.9816543235, 60.1770977471][::-1]
108
+ ),
109
+ "camera_types": ["spherical", "equirectangular"],
110
+ "osm_file": "helsinki.osm",
111
+ },
112
+ "milan": {
113
+ "bbox": BoundaryBox(
114
+ [9.1732723899, 45.4810977947][::-1],
115
+ [9.2255987917, 45.5284238563][::-1],
116
+ ),
117
+ "camera_types": ["spherical", "equirectangular"],
118
+ "osm_file": "milan.osm",
119
+ },
120
+ "avignon": {
121
+ "bbox": BoundaryBox(
122
+ [4.7887045302, 43.9416178156][::-1], [4.8227015622, 43.9584848909][::-1]
123
+ ),
124
+ "camera_types": ["spherical", "equirectangular"],
125
+ "osm_file": "avignon.osm",
126
+ },
127
+ "paris": {
128
+ "bbox": BoundaryBox([2.306823, 48.833827][::-1], [2.39067, 48.889335][::-1]),
129
+ "camera_types": ["spherical", "equirectangular"],
130
+ "osm_file": "paris.osm",
131
+ },
132
+ }
133
+
134
+
135
+ cfg = OmegaConf.create(
136
+ {
137
+ "max_image_size": 512,
138
+ "do_legacy_pano_offset": True,
139
+ "min_dist_between_keyframes": 4,
140
+ "tiling": {
141
+ "tile_size": 128,
142
+ "margin": 128,
143
+ "ppm": 2,
144
+ },
145
+ }
146
+ )
147
+
148
+
149
+ def get_pano_offset(image_info: dict, do_legacy: bool = False) -> float:
150
+ if do_legacy:
151
+ seed = int(image_info["sfm_cluster"]["id"])
152
+ else:
153
+ seed = image_info["sequence"].__hash__()
154
+ seed = seed % (2**32 - 1)
155
+ return np.random.RandomState(seed).uniform(-45, 45)
156
+
157
+
158
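+ # Undistort a single view and write the results to output_dir: panoramas are
+ # rendered into perspective crops, fisheye and perspective images are undistorted
+ # and resized to the maximum image size.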
+ def process_shot(
159
+ shot: Shot, info: dict, image_path: Path, output_dir: Path, cfg: DictConfig
160
+ ) -> List[Shot]:
161
+ if not image_path.exists():
162
+ return None
163
+
164
+ image_orig = cv2.imread(str(image_path))
165
+ max_size = cfg.max_image_size
166
+ pano_offset = None
167
+
168
+ camera = shot.camera
169
+ camera.width, camera.height = image_orig.shape[:2][::-1]
170
+ if camera.is_panorama(camera.projection_type):
171
+ camera_new = perspective_camera_from_pano(camera, max_size)
172
+ undistorter = PanoramaUndistorter(camera, camera_new)
173
+ pano_offset = get_pano_offset(info, cfg.do_legacy_pano_offset)
174
+ elif camera.projection_type in ["fisheye", "perspective"]:
175
+ if camera.projection_type == "fisheye":
176
+ camera_new = perspective_camera_from_fisheye(camera)
177
+ else:
178
+ camera_new = perspective_camera_from_perspective(camera)
179
+ camera_new = scale_camera(camera_new, max_size)
180
+ camera_new.id = camera.id + "_undistorted"
181
+ undistorter = CameraUndistorter(camera, camera_new)
182
+ else:
183
+ raise NotImplementedError(camera.projection_type)
184
+
185
+ shots_undist, images_undist = undistort_shot(
186
+ image_orig, shot, undistorter, pano_offset
187
+ )
188
+ for shot, image in zip(shots_undist, images_undist):
189
+ cv2.imwrite(str(output_dir / f"{shot.id}.jpg"), image)
190
+
191
+ return shots_undist
192
+
193
+
194
+ def pack_shot_dict(shot: Shot, info: dict) -> dict:
195
+ latlong = info["computed_geometry"]["coordinates"][::-1]
196
+ latlong_gps = info["geometry"]["coordinates"][::-1]
197
+ w_p_c = shot.pose.get_origin()
198
+ w_r_c = shot.pose.get_R_cam_to_world()
199
+ rpy = decompose_rotmat(w_r_c)
200
+ return dict(
201
+ camera_id=shot.camera.id,
202
+ latlong=latlong,
203
+ t_c2w=w_p_c,
204
+ R_c2w=w_r_c,
205
+ roll_pitch_yaw=rpy,
206
+ capture_time=info["captured_at"],
207
+ gps_position=np.r_[latlong_gps, info["altitude"]],
208
+ compass_angle=info["compass_angle"],
209
+ chunk_id=int(info["sfm_cluster"]["id"]),
210
+ )
211
+
212
+
213
+ def pack_camera_dict(camera: Camera) -> dict:
214
+ assert camera.projection_type == "perspective"
215
+ K = camera.get_K_in_pixel_coordinates(camera.width, camera.height)
216
+ return dict(
217
+ id=camera.id,
218
+ model="PINHOLE",
219
+ width=camera.width,
220
+ height=camera.height,
221
+ params=K[[0, 1, 0, 1], [0, 1, 2, 2]],
222
+ )
223
+
224
+
225
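+ # Sort the views of a sequence by capture time, select keyframes by travelled
+ # distance, undistort them, and pack the results into per-sequence view and camera
+ # dictionaries.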
+ def process_sequence(
226
+ image_ids: List[int],
227
+ image_infos: dict,
228
+ projection: Projection,
229
+ cfg: DictConfig,
230
+ raw_image_dir: Path,
231
+ out_image_dir: Path,
232
+ ):
233
+ shots = []
234
+ image_ids = sorted(image_ids, key=lambda i: image_infos[i]["captured_at"])
235
+ for i in image_ids:
236
+ _, shot = opensfm_shot_from_info(image_infos[i], projection)
237
+ shots.append(shot)
238
+ if not shots:
239
+ return {}
240
+
241
+ shot_idxs = keyframe_selection(shots, min_dist=cfg.min_dist_between_keyframes)
242
+ shots = [shots[i] for i in shot_idxs]
243
+
244
+ shots_out = thread_map(
245
+ lambda shot: process_shot(
246
+ shot,
247
+ image_infos[int(shot.id)],
248
+ raw_image_dir / image_filename.format(image_id=shot.id),
249
+ out_image_dir,
250
+ cfg,
251
+ ),
252
+ shots,
253
+ disable=True,
254
+ )
255
+ shots_out = [(i, s) for i, ss in enumerate(shots_out) if ss is not None for s in ss]
256
+
257
+ dump = {}
258
+ for index, shot in shots_out:
259
+ i, suffix = shot.id.rsplit("_", 1)
260
+ info = image_infos[int(i)]
261
+ seq_id = info["sequence"]
262
+ is_pano = not suffix.endswith("undistorted")
263
+ if is_pano:
264
+ seq_id += f"_{suffix}"
265
+ if seq_id not in dump:
266
+ dump[seq_id] = dict(views={}, cameras={})
267
+
268
+ view = pack_shot_dict(shot, info)
269
+ view["index"] = index
270
+ dump[seq_id]["views"][shot.id] = view
271
+ dump[seq_id]["cameras"][shot.camera.id] = pack_camera_dict(shot.camera)
272
+ return dump
273
+
274
+
275
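+ # Full pipeline for one location: fetch the image metadata and pixels, undistort
+ # and dump all views, create or download the map tiles, and plot the data split.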
+ def process_location(
276
+ location: str,
277
+ data_dir: Path,
278
+ split_path: Path,
279
+ token: str,
280
+ generate_tiles: bool = False,
281
+ ):
282
+ params = location_to_params[location]
283
+ bbox = params["bbox"]
284
+ projection = Projection(*bbox.center)
285
+
286
+ splits = json.loads(split_path.read_text())
287
+ image_ids = [i for split in splits.values() for i in split[location]]
288
+
289
+ loc_dir = data_dir / location
290
+ infos_dir = loc_dir / "image_infos"
291
+ raw_image_dir = loc_dir / "images_raw"
292
+ out_image_dir = loc_dir / "images"
293
+ for d in (infos_dir, raw_image_dir, out_image_dir):
294
+ d.mkdir(parents=True, exist_ok=True)
295
+
296
+ downloader = MapillaryDownloader(token)
297
+ loop = asyncio.get_event_loop()
298
+
299
+ logger.info("Fetching metadata for all images.")
300
+ image_infos, num_fail = loop.run_until_complete(
301
+ fetch_image_infos(image_ids, downloader, infos_dir)
302
+ )
303
+ logger.info("%d failures (%.1f%%).", num_fail, 100 * num_fail / len(image_ids))
304
+
305
+ logger.info("Fetching image pixels.")
306
+ image_urls = [(i, info["thumb_2048_url"]) for i, info in image_infos.items()]
307
+ num_fail = loop.run_until_complete(
308
+ fetch_images_pixels(image_urls, downloader, raw_image_dir)
309
+ )
310
+ logger.info("%d failures (%.1f%%).", num_fail, 100 * num_fail / len(image_urls))
311
+
312
+ seq_to_image_ids = defaultdict(list)
313
+ for i, info in image_infos.items():
314
+ seq_to_image_ids[info["sequence"]].append(i)
315
+ seq_to_image_ids = dict(seq_to_image_ids)
316
+
317
+ dump = {}
318
+ for seq_image_ids in tqdm(seq_to_image_ids.values()):
319
+ dump.update(
320
+ process_sequence(
321
+ seq_image_ids,
322
+ image_infos,
323
+ projection,
324
+ cfg,
325
+ raw_image_dir,
326
+ out_image_dir,
327
+ )
328
+ )
329
+ write_json(loc_dir / "dump.json", dump)
330
+
331
+ # Get the view locations
332
+ view_ids = []
333
+ views_latlon = []
334
+ for seq in dump:
335
+ for view_id, view in dump[seq]["views"].items():
336
+ view_ids.append(view_id)
337
+ views_latlon.append(view["latlong"])
338
+ views_latlon = np.stack(views_latlon)
339
+ view_ids = np.array(view_ids)
340
+ views_xy = projection.project(views_latlon)
341
+
342
+ tiles_path = loc_dir / MapillaryDataModule.default_cfg["tiles_filename"]
343
+ if generate_tiles:
344
+ logger.info("Creating the map tiles.")
345
+ bbox_data = BoundaryBox(views_xy.min(0), views_xy.max(0))
346
+ bbox_tiling = bbox_data + cfg.tiling.margin
347
+ osm_dir = data_dir / "osm"
348
+ osm_path = osm_dir / params["osm_file"]
349
+ if not osm_path.exists():
350
+ logger.info("Downloading OSM raw data.")
351
+ download_file(DATA_URL + f"/osm/{params['osm_file']}", osm_path)
352
+ if not osm_path.exists():
353
+ raise FileNotFoundError(f"Cannot find OSM data file {osm_path}.")
354
+ tile_manager = TileManager.from_bbox(
355
+ projection,
356
+ bbox_tiling,
357
+ cfg.tiling.ppm,
358
+ tile_size=cfg.tiling.tile_size,
359
+ path=osm_path,
360
+ )
361
+ tile_manager.save(tiles_path)
362
+ else:
363
+ logger.info("Downloading pre-generated map tiles.")
364
+ download_file(DATA_URL + f"/tiles/{location}.pkl", tiles_path)
365
+
366
+ # Visualize the data split
367
+ plotter = GeoPlotter()
368
+ view_ids_val = set(splits["val"][location])
369
+ is_val = np.array([int(i.rsplit("_", 1)[0]) in view_ids_val for i in view_ids])
370
+ plotter.points(views_latlon[~is_val], "red", view_ids[~is_val], "train")
371
+ plotter.points(views_latlon[is_val], "green", view_ids[is_val], "val")
372
+ plotter.bbox(bbox, "blue", "query bounding box")
373
+ plotter.bbox(projection.unproject(bbox_tiling), "black", "tiling bounding box")
374
+ geo_viz_path = loc_dir / f"split_{location}.html"
375
+ plotter.fig.write_html(geo_viz_path)
376
+ logger.info("Wrote split visualization to %s.", geo_viz_path)
377
+
378
+ shutil.rmtree(raw_image_dir)
379
+ logger.info("Done processing for location %s.", location)
380
+
381
+
382
+ if __name__ == "__main__":
383
+ parser = argparse.ArgumentParser()
384
+ parser.add_argument(
385
+ "--locations", type=str, nargs="+", default=list(location_to_params)
386
+ )
387
+ parser.add_argument("--split_filename", type=str, default="splits_MGL_13loc.json")
388
+ parser.add_argument("--token", type=str, required=True)
389
+ parser.add_argument(
390
+ "--data_dir", type=Path, default=MapillaryDataModule.default_cfg["data_dir"]
391
+ )
392
+ parser.add_argument("--generate_tiles", action="store_true")
393
+ args = parser.parse_args()
394
+
395
+ args.data_dir.mkdir(exist_ok=True, parents=True)
396
+ shutil.copy(Path(__file__).parent / args.split_filename, args.data_dir)
397
+
398
+ for location in args.locations:
399
+ logger.info("Starting processing for location %s.", location)
400
+ process_location(
401
+ location,
402
+ args.data_dir,
403
+ args.data_dir / args.split_filename,
404
+ args.token,
405
+ args.generate_tiles,
406
+ )
maploc/data/mapillary/splits_MGL_13loc.json ADDED
The diff for this file is too large to render. See raw diff
 
maploc/data/mapillary/utils.py ADDED
@@ -0,0 +1,173 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import logging
4
+ from typing import List, Tuple
5
+
6
+ import cv2
7
+ import numpy as np
8
+ from opensfm import features
9
+ from opensfm.pygeometry import Camera, compute_camera_mapping, Pose
10
+ from opensfm.pymap import Shot
11
+ from scipy.spatial.transform import Rotation
12
+
13
+ logger = logging.getLogger(__name__)
14
+
15
+
16
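+ # Select a keyframe whenever the distance travelled since the last selected frame
+ # reaches min_dist.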
+ def keyframe_selection(shots: List[Shot], min_dist: float = 4) -> List[int]:
17
+ camera_centers = np.stack([shot.pose.get_origin() for shot in shots], 0)
18
+ distances = np.linalg.norm(np.diff(camera_centers, axis=0), axis=1)
19
+ selected = [0]
20
+ cum = 0
21
+ for i in range(1, len(camera_centers)):
22
+ cum += distances[i - 1]
23
+ if cum >= min_dist:
24
+ selected.append(i)
25
+ cum = 0
26
+ return selected
27
+
28
+
29
+ def perspective_camera_from_pano(camera: Camera, size: int) -> Camera:
30
+ camera_new = Camera.create_perspective(0.5, 0, 0)
31
+ camera_new.height = camera_new.width = size
32
+ camera_new.id = "perspective_from_pano"
33
+ return camera_new
34
+
35
+
36
+ def scale_camera(camera: Camera, max_size: int) -> Camera:
37
+ height = camera.height
38
+ width = camera.width
39
+ factor = max_size / float(max(height, width))
40
+ if factor >= 1:
41
+ return camera
42
+ camera.width = int(round(width * factor))
43
+ camera.height = int(round(height * factor))
44
+ return camera
45
+
46
+
47
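+ # Precompute the bearings of the target perspective pixels; __call__ rotates them
+ # into the panorama frame and samples the colors with cv2.remap.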
+ class PanoramaUndistorter:
48
+ def __init__(self, camera_pano: Camera, camera_new: Camera):
49
+ w, h = camera_new.width, camera_new.height
50
+ self.shape = (h, w)
51
+
52
+ dst_y, dst_x = np.indices(self.shape).astype(np.float32)
53
+ dst_pixels_denormalized = np.column_stack([dst_x.ravel(), dst_y.ravel()])
54
+ dst_pixels = features.normalized_image_coordinates(
55
+ dst_pixels_denormalized, w, h
56
+ )
57
+ self.dst_bearings = camera_new.pixel_bearing_many(dst_pixels)
58
+
59
+ self.camera_pano = camera_pano
60
+ self.camera_perspective = camera_new
61
+
62
+ def __call__(
63
+ self, image: np.ndarray, panoshot: Shot, perspectiveshot: Shot
64
+ ) -> np.ndarray:
65
+ # Rotate to panorama reference frame
66
+ rotation = np.dot(
67
+ panoshot.pose.get_rotation_matrix(),
68
+ perspectiveshot.pose.get_rotation_matrix().T,
69
+ )
70
+ rotated_bearings = np.dot(self.dst_bearings, rotation.T)
71
+
72
+ # Project to panorama pixels
73
+ src_pixels = panoshot.camera.project_many(rotated_bearings)
74
+ src_pixels_denormalized = features.denormalized_image_coordinates(
75
+ src_pixels, image.shape[1], image.shape[0]
76
+ )
77
+ src_pixels_denormalized.shape = self.shape + (2,)
78
+
79
+ # Sample color
80
+ x = src_pixels_denormalized[..., 0].astype(np.float32)
81
+ y = src_pixels_denormalized[..., 1].astype(np.float32)
82
+ colors = cv2.remap(image, x, y, cv2.INTER_LINEAR, borderMode=cv2.BORDER_WRAP)
83
+ return colors
84
+
85
+
86
+ class CameraUndistorter:
87
+ def __init__(self, camera_distorted: Camera, camera_new: Camera):
88
+ self.maps = compute_camera_mapping(
89
+ camera_distorted,
90
+ camera_new,
91
+ camera_distorted.width,
92
+ camera_distorted.height,
93
+ )
94
+ self.camera_perspective = camera_new
95
+ self.camera_distorted = camera_distorted
96
+
97
+ def __call__(self, image: np.ndarray) -> np.ndarray:
98
+ assert image.shape[:2] == (
99
+ self.camera_distorted.height,
100
+ self.camera_distorted.width,
101
+ )
102
+ undistorted = cv2.remap(image, *self.maps, cv2.INTER_LINEAR)
103
+ resized = cv2.resize(
104
+ undistorted,
105
+ (self.camera_perspective.width, self.camera_perspective.height),
106
+ interpolation=cv2.INTER_AREA,
107
+ )
108
+ return resized
109
+
110
+
111
+ def render_panorama(
112
+ shot: Shot,
113
+ pano: np.ndarray,
114
+ undistorter: PanoramaUndistorter,
115
+ offset: float = 0.0,
116
+ ) -> Tuple[List[Shot], List[np.ndarray]]:
117
+ yaws = [0, 90, 180, 270]
118
+ suffixes = ["front", "left", "back", "right"]
119
+ images = []
120
+ shots = []
121
+
122
+ # To reduce aliasing, since cv2.remap does not support area sampling,
123
+ # we first resize with anti-aliasing.
124
+ h, w = undistorter.shape
125
+ h, w = (w * 2, w * 4) # assuming 90deg FOV
126
+ pano_resized = cv2.resize(pano, (w, h), interpolation=cv2.INTER_AREA)
127
+
128
+ for yaw, suffix in zip(yaws, suffixes):
129
+ R_pano2persp = Rotation.from_euler("Y", yaw + offset, degrees=True).as_matrix()
130
+ name = f"{shot.id}_{suffix}"
131
+ shot_new = Shot(
132
+ name,
133
+ undistorter.camera_perspective,
134
+ Pose.compose(Pose(R_pano2persp), shot.pose),
135
+ )
136
+ shot_new.metadata = shot.metadata
137
+ perspective = undistorter(pano_resized, shot, shot_new)
138
+ images.append(perspective)
139
+ shots.append(shot_new)
140
+ return shots, images
141
+
142
+
143
+ def undistort_camera(
144
+ shot: Shot, image: np.ndarray, undistorter: CameraUndistorter
145
+ ) -> Tuple[Shot, np.ndarray]:
146
+ name = f"{shot.id}_undistorted"
147
+ shot_out = Shot(name, undistorter.camera_perspective, shot.pose)
148
+ shot_out.metadata = shot.metadata
149
+ undistorted = undistorter(image)
150
+ return shot_out, undistorted
151
+
152
+
153
+ def undistort_shot(
154
+ image_raw: np.ndarray,
155
+ shot_orig: Shot,
156
+ undistorter,
157
+ pano_offset: float,
158
+ ) -> Tuple[List[Shot], List[np.ndarray]]:
159
+ camera = shot_orig.camera
160
+ if image_raw.shape[:2] != (camera.height, camera.width):
161
+ raise ValueError(
162
+ shot_orig.id, image_raw.shape[:2], (camera.height, camera.width)
163
+ )
164
+ if camera.is_panorama(camera.projection_type):
165
+ shots, undistorted = render_panorama(
166
+ shot_orig, image_raw, undistorter, offset=pano_offset
167
+ )
168
+ elif camera.projection_type in ("perspective", "fisheye"):
169
+ shot, undistorted = undistort_camera(shot_orig, image_raw, undistorter)
170
+ shots, undistorted = [shot], [undistorted]
171
+ else:
172
+ raise NotImplementedError(camera.projection_type)
173
+ return shots, undistorted
maploc/data/sequential.py ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ import torch
5
+
6
+
7
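+ # Split a sequence into chunks bounded by length, inter-frame distance, total
+ # distance, and time gap between frames; the chunks are returned longest first.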
+ def chunk_sequence(
8
+ data,
9
+ indices,
10
+ *,
11
+ names=None,
12
+ max_length=100,
13
+ min_length=1,
14
+ max_delay_s=None,
15
+ max_inter_dist=None,
16
+ max_total_dist=None,
17
+ ):
18
+ sort_array = data.get("capture_time", data.get("index", names or indices))
19
+ indices = sorted(indices, key=lambda i: sort_array[i].tolist())
20
+ centers = torch.stack([data["t_c2w"][i][:2] for i in indices]).numpy()
21
+ dists = np.linalg.norm(np.diff(centers, axis=0), axis=-1)
22
+ if "capture_time" in data:
23
+ times = torch.stack([data["capture_time"][i] for i in indices])
24
+ times = times.double() / 1e3 # ms to s
25
+ delays = np.diff(times, axis=0)
26
+ else:
27
+ delays = np.zeros_like(dists)
28
+ chunks = [[indices[0]]]
29
+ dist_total = 0
30
+ for dist, delay, idx in zip(dists, delays, indices[1:]):
31
+ dist_total += dist
32
+ if (
33
+ (max_inter_dist is not None and dist > max_inter_dist)
34
+ or (max_total_dist is not None and dist_total > max_total_dist)
35
+ or (max_delay_s is not None and delay > max_delay_s)
36
+ or len(chunks[-1]) >= max_length
37
+ ):
38
+ chunks.append([])
39
+ dist_total = 0
40
+ chunks[-1].append(idx)
41
+ chunks = list(filter(lambda c: len(c) >= min_length, chunks))
42
+ chunks = sorted(chunks, key=len, reverse=True)
43
+ return chunks
44
+
45
+
46
+ def unpack_batches(batches):
47
+ images = [b["image"].permute(1, 2, 0) for b in batches]
48
+ canvas = [b["canvas"] for b in batches]
49
+ rasters = [b["map"] for b in batches]
50
+ yaws = torch.stack([b["roll_pitch_yaw"][-1] for b in batches])
51
+ uv_gt = torch.stack([b["uv"] for b in batches])
52
+ xy_gt = torch.stack(
53
+ [canv.to_xy(uv.cpu().double()) for uv, canv in zip(uv_gt, canvas)]
54
+ )
55
+ ret = [images, canvas, rasters, yaws, uv_gt, xy_gt.to(uv_gt)]
56
+ if "uv_gps" in batches[0]:
57
+ xy_gps = torch.stack(
58
+ [c.to_xy(b["uv_gps"].cpu().double()) for b, c in zip(batches, canvas)]
59
+ )
60
+ ret.append(xy_gps.to(uv_gt))
61
+ return ret
maploc/data/torch.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import collections
4
+ import os
5
+
6
+ import torch
7
+ from torch.utils.data import get_worker_info
8
+ from torch.utils.data._utils.collate import (
9
+ default_collate_err_msg_format,
10
+ np_str_obj_array_pattern,
11
+ )
12
+ from lightning_fabric.utilities.seed import pl_worker_init_function
13
+ from lightning_utilities.core.apply_func import apply_to_collection
14
+ from lightning_fabric.utilities.apply_func import move_data_to_device
15
+
16
+
17
+ def collate(batch):
18
+ """Difference with PyTorch default_collate: it can stack other tensor-like objects.
19
+ Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
20
+ https://github.com/cvg/pixloc
21
+ Released under the Apache License 2.0
22
+ """
23
+ if not isinstance(batch, list): # no batching
24
+ return batch
25
+ elem = batch[0]
26
+ elem_type = type(elem)
27
+ if isinstance(elem, torch.Tensor):
28
+ out = None
29
+ if torch.utils.data.get_worker_info() is not None:
30
+ # If we're in a background process, concatenate directly into a
31
+ # shared memory tensor to avoid an extra copy
32
+ numel = sum(x.numel() for x in batch)
33
+ storage = elem.storage()._new_shared(numel, device=elem.device)
34
+ out = elem.new(storage).resize_(len(batch), *list(elem.size()))
35
+ return torch.stack(batch, 0, out=out)
36
+ elif (
37
+ elem_type.__module__ == "numpy"
38
+ and elem_type.__name__ != "str_"
39
+ and elem_type.__name__ != "string_"
40
+ ):
41
+ if elem_type.__name__ == "ndarray" or elem_type.__name__ == "memmap":
42
+ # array of string classes and object
43
+ if np_str_obj_array_pattern.search(elem.dtype.str) is not None:
44
+ raise TypeError(default_collate_err_msg_format.format(elem.dtype))
45
+
46
+ return collate([torch.as_tensor(b) for b in batch])
47
+ elif elem.shape == (): # scalars
48
+ return torch.as_tensor(batch)
49
+ elif isinstance(elem, float):
50
+ return torch.tensor(batch, dtype=torch.float64)
51
+ elif isinstance(elem, int):
52
+ return torch.tensor(batch)
53
+ elif isinstance(elem, (str, bytes)):
54
+ return batch
55
+ elif isinstance(elem, collections.abc.Mapping):
56
+ return {key: collate([d[key] for d in batch]) for key in elem}
57
+ elif isinstance(elem, tuple) and hasattr(elem, "_fields"): # namedtuple
58
+ return elem_type(*(collate(samples) for samples in zip(*batch)))
59
+ elif isinstance(elem, collections.abc.Sequence):
60
+ # check to make sure that the elements in batch have consistent size
61
+ it = iter(batch)
62
+ elem_size = len(next(it))
63
+ if not all(len(elem) == elem_size for elem in it):
64
+ raise RuntimeError("each element in list of batch should be of equal size")
65
+ transposed = zip(*batch)
66
+ return [collate(samples) for samples in transposed]
67
+ else:
68
+ # try to stack anyway in case the object implements stacking.
69
+ try:
70
+ return torch.stack(batch, 0)
71
+ except TypeError as e:
72
+ if "expected Tensor as element" in str(e):
73
+ return batch
74
+ else:
75
+ raise e
76
+
77
+
78
+ def set_num_threads(nt):
79
+ """Force numpy and other libraries to use a limited number of threads."""
80
+ try:
81
+ import mkl
82
+ except ImportError:
83
+ pass
84
+ else:
85
+ mkl.set_num_threads(nt)
86
+ torch.set_num_threads(1)
87
+ os.environ["IPC_ENABLE"] = "1"
88
+ for o in [
89
+ "OPENBLAS_NUM_THREADS",
90
+ "NUMEXPR_NUM_THREADS",
91
+ "OMP_NUM_THREADS",
92
+ "MKL_NUM_THREADS",
93
+ ]:
94
+ os.environ[o] = str(nt)
95
+
96
+
97
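+ # Seed each dataloader worker and optionally limit the number of threads it uses.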
+ def worker_init_fn(i):
98
+ info = get_worker_info()
99
+ pl_worker_init_function(info.id)
100
+ num_threads = info.dataset.cfg.get("num_threads")
101
+ if num_threads is not None:
102
+ set_num_threads(num_threads)
103
+
104
+
105
+ def unbatch_to_device(data, device="cpu"):
106
+ data = move_data_to_device(data, device)
107
+ data = apply_to_collection(data, torch.Tensor, lambda x: x.squeeze(0))
108
+ data = apply_to_collection(
109
+ data, list, lambda x: x[0] if len(x) == 1 and isinstance(x[0], str) else x
110
+ )
111
+ return data
maploc/data/utils.py ADDED
@@ -0,0 +1,60 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ from scipy.spatial.transform import Rotation
5
+
6
+
7
+ def crop_map(raster, xy, size, seed=None):
8
+ h, w = raster.shape[-2:]
9
+ state = np.random.RandomState(seed)
10
+ top = state.randint(0, h - size + 1)
11
+ left = state.randint(0, w - size + 1)
12
+ raster = raster[..., top : top + size, left : left + size]
13
+ xy -= np.array([left, top])
14
+ return raster, xy
15
+
16
+
17
+ def random_rot90(raster, xy, heading, seed=None):
18
+ rot = np.random.RandomState(seed).randint(0, 4)
19
+ heading = (heading + rot * np.pi / 2) % (2 * np.pi)
20
+ h, w = raster.shape[-2:]
21
+ if rot == 0:
22
+ xy2 = xy
23
+ elif rot == 2:
24
+ xy2 = np.array([w, h]) - 1 - xy
25
+ elif rot == 1:
26
+ xy2 = np.array([xy[1], w - 1 - xy[0]])
27
+ elif rot == 3:
28
+ xy2 = np.array([h - 1 - xy[1], xy[0]])
29
+ else:
30
+ raise ValueError(rot)
31
+ raster = np.rot90(raster, rot, axes=(-2, -1))
32
+ return raster, xy2, heading
33
+
34
+
35
+ def random_flip(image, raster, xy, heading, seed=None):
36
+ state = np.random.RandomState(seed)
37
+ if state.rand() > 0.5: # no flip
38
+ return image, raster, xy, heading
39
+ image = image[:, ::-1]
40
+ h, w = raster.shape[-2:]
41
+ if state.rand() > 0.5: # flip x
42
+ raster = raster[..., :, ::-1]
43
+ xy = np.array([w - 1 - xy[0], xy[1]])
44
+ heading = np.pi - heading
45
+ else: # flip y
46
+ raster = raster[..., ::-1, :]
47
+ xy = np.array([xy[0], h - 1 - xy[1]])
48
+ heading = -heading
49
+ heading = heading % (2 * np.pi)
50
+ return image, raster, xy, heading
51
+
52
+
53
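+ # Decompose a camera-to-world rotation matrix into roll, pitch, yaw angles in degrees.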
+ def decompose_rotmat(R_c2w):
54
+ R_cv2xyz = Rotation.from_euler("X", -90, degrees=True)
55
+ rot_w2c = R_cv2xyz * Rotation.from_matrix(R_c2w).inv()
56
+ roll, pitch, yaw = rot_w2c.as_euler("YXZ", degrees=True)
57
+ # rot_w2c_check = R_cv2xyz.inv() * Rotation.from_euler('YXZ', [roll, pitch, yaw], degrees=True)
58
+ # np.testing.assert_allclose(rot_w2c_check.as_matrix(), R_c2w.T, rtol=1e-6, atol=1e-6)
59
+ # R_plane2c = Rotation.from_euler("ZX", [roll, pitch], degrees=True).as_matrix()
60
+ return roll, pitch, yaw
maploc/demo.py ADDED
@@ -0,0 +1,209 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ from typing import Optional, Tuple
4
+
5
+ import torch
6
+ import numpy as np
7
+
8
+ from . import logger
9
+ from .evaluation.run import resolve_checkpoint_path, pretrained_models
10
+ from .models.orienternet import OrienterNet
11
+ from .models.voting import fuse_gps, argmax_xyr
12
+ from .data.image import resize_image, pad_image, rectify_image
13
+ from .osm.raster import Canvas
14
+ from .utils.wrappers import Camera
15
+ from .utils.io import read_image
16
+ from .utils.geo import BoundaryBox, Projection
17
+ from .utils.exif import EXIF
18
+
19
+ try:
20
+ from geopy.geocoders import Nominatim
21
+
22
+ geolocator = Nominatim(user_agent="orienternet")
23
+ except ImportError:
24
+ geolocator = None
25
+
26
+ try:
27
+ from gradio_client import Client
28
+
29
+ calibrator = Client("https://jinlinyi-perspectivefields.hf.space/")
30
+ except (ImportError, ValueError):
31
+ calibrator = None
32
+
33
+
34
+ def image_calibration(image_path):
35
+ logger.info("Calling the PerspectiveFields calibrator, this may take some time.")
36
+ result = calibrator.predict(
37
+ image_path, "NEW:Paramnet-360Cities-edina-centered", api_name="/predict"
38
+ )
39
+ result = dict(r.rsplit(" ", 1) for r in result[1].split("\n"))
40
+ roll_pitch = float(result["roll"]), float(result["pitch"])
41
+ return roll_pitch, float(result["vertical fov"])
42
+
43
+
44
+ def camera_from_exif(exif: EXIF, fov: Optional[float] = None) -> Camera:
45
+ w, h = image_size = exif.extract_image_size()
46
+ _, f_ratio = exif.extract_focal()
47
+ if f_ratio == 0:
48
+ if fov is not None:
49
+ # This is the vertical FoV.
50
+ f = h / 2 / np.tan(np.deg2rad(fov) / 2)
51
+ else:
52
+ return None
53
+ else:
54
+ f = f_ratio * max(image_size)
55
+ return Camera.from_dict(
56
+ dict(
57
+ model="SIMPLE_PINHOLE",
58
+ width=w,
59
+ height=h,
60
+ params=[f, w / 2 + 0.5, h / 2 + 0.5],
61
+ )
62
+ )
63
+
64
+
65
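+ # Load the query image, estimate roll/pitch and FoV with PerspectiveFields if it is
+ # available, build the camera from the EXIF or the FoV guess, and resolve a location
+ # prior from the arguments, a geocoded address, or the EXIF GPS tag.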
+ def read_input_image(
66
+ image_path: str,
67
+ prior_latlon: Optional[Tuple[float, float]] = None,
68
+ prior_address: Optional[str] = None,
69
+ fov: Optional[float] = None,
70
+ tile_size_meters: int = 64,
71
+ ):
72
+ image = read_image(image_path)
73
+
74
+ roll_pitch = None
75
+ if calibrator is not None:
76
+ roll_pitch, fov = image_calibration(image_path)
77
+ else:
78
+ logger.info("Could not call PerspectiveFields, maybe install gradio_client?")
79
+ if roll_pitch is not None:
80
+ logger.info("Using (roll, pitch) %s.", roll_pitch)
81
+
82
+ with open(image_path, "rb") as fid:
83
+ exif = EXIF(fid, lambda: image.shape[:2])
84
+ camera = camera_from_exif(exif, fov)
85
+ if camera is None:
86
+ raise ValueError(
87
+ "No camera intrinsics found in the EXIF, provide an FoV guess."
88
+ )
89
+
90
+ latlon = None
91
+ if prior_latlon is not None:
92
+ latlon = prior_latlon
93
+ logger.info("Using prior latlon %s.", prior_latlon)
94
+ if prior_address is not None:
95
+ if geolocator is None:
96
+ raise ValueError("geocoding unavailable, install geopy.")
97
+ location = geolocator.geocode(prior_address)
98
+ if location is None:
99
+ logger.info("Could not find any location for %s.", prior_address)
100
+ else:
101
+ logger.info("Using prior address: %s", location.address)
102
+ latlon = (location.latitude, location.longitude)
103
+ if latlon is None:
104
+ geo = exif.extract_geo()
105
+ if geo:
106
+ alt = geo.get("altitude", 0) # read if available
107
+ latlon = (geo["latitude"], geo["longitude"], alt)
108
+ logger.info("Using prior location from EXIF.")
109
+ else:
110
+ logger.info("Could not find any prior location in EXIF.")
111
+ if latlon is None:
112
+ raise ValueError("Need prior latlon")
113
+ latlon = np.array(latlon)
114
+
115
+ proj = Projection(*latlon)
116
+ center = proj.project(latlon)
117
+ bbox = BoundaryBox(center, center) + tile_size_meters
118
+ return image, camera, roll_pitch, proj, bbox, latlon
119
+
120
+
121
+ class Demo:
122
+ def __init__(
123
+ self,
124
+ experiment_or_path: Optional[str] = "OrienterNet_MGL",
125
+ device=None,
126
+ **kwargs
127
+ ):
128
+ if experiment_or_path in pretrained_models:
129
+ experiment_or_path, _ = pretrained_models[experiment_or_path]
130
+ path = resolve_checkpoint_path(experiment_or_path)
131
+ ckpt = torch.load(path, map_location=(lambda storage, loc: storage))
132
+ config = ckpt["hyper_parameters"]
133
+ config.model.update(kwargs)
134
+ config.model.image_encoder.backbone.pretrained = False
135
+
136
+ model = OrienterNet(config.model).eval()
137
+ state = {k[len("model.") :]: v for k, v in ckpt["state_dict"].items()}
138
+ model.load_state_dict(state, strict=True)
139
+ if device is None:
140
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
141
+ model = model.to(device)
142
+
143
+ self.model = model
144
+ self.config = config
145
+ self.device = device
146
+
147
+ def prepare_data(
148
+ self,
149
+ image: np.ndarray,
150
+ camera: Camera,
151
+ canvas: Canvas,
152
+ roll_pitch: Optional[Tuple[float]] = None,
153
+ ):
154
+ assert image.shape[:2][::-1] == tuple(camera.size.tolist())
155
+ target_focal_length = self.config.data.resize_image / 2
156
+ factor = target_focal_length / camera.f
157
+ size = (camera.size * factor).round().int()
158
+
159
+ image = torch.from_numpy(image).permute(2, 0, 1).float().div_(255)
160
+ valid = None
161
+ if roll_pitch is not None:
162
+ roll, pitch = roll_pitch
163
+ image, valid = rectify_image(
164
+ image,
165
+ camera.float(),
166
+ roll=-roll,
167
+ pitch=-pitch,
168
+ )
169
+ image, _, camera, *maybe_valid = resize_image(
170
+ image, size.numpy(), camera=camera, valid=valid
171
+ )
172
+ valid = None if valid is None else maybe_valid
173
+
174
+ max_stride = max(self.model.image_encoder.layer_strides)
175
+ size = (np.ceil((size / max_stride)) * max_stride).int()
176
+ image, valid, camera = pad_image(
177
+ image, size.numpy(), camera, crop_and_center=True
178
+ )
179
+
180
+ return dict(
181
+ image=image,
182
+ map=torch.from_numpy(canvas.raster).long(),
183
+ camera=camera.float(),
184
+ valid=valid,
185
+ )
186
+
187
+ def localize(self, image: np.ndarray, camera: Camera, canvas: Canvas, **kwargs):
188
+ data = self.prepare_data(image, camera, canvas, **kwargs)
189
+ data_ = {k: v.to(self.device)[None] for k, v in data.items()}
190
+ with torch.no_grad():
191
+ pred = self.model(data_)
192
+
193
+ xy_gps = canvas.bbox.center
194
+ uv_gps = torch.from_numpy(canvas.to_uv(xy_gps))
195
+
196
+ lp_xyr = pred["log_probs"].squeeze(0)
197
+ tile_size = canvas.bbox.size.min() / 2
198
+ sigma = tile_size - 20 # 20 meters margin
199
+ lp_xyr = fuse_gps(
200
+ lp_xyr,
201
+ uv_gps.to(lp_xyr),
202
+ self.config.model.pixel_per_meter,
203
+ sigma=sigma,
204
+ )
205
+ xyr = argmax_xyr(lp_xyr).cpu()
206
+
207
+ prob = lp_xyr.exp().cpu()
208
+ neural_map = pred["map"]["map_features"][0].squeeze(0).cpu()
209
+ return xyr[:2], xyr[2], prob, neural_map, data["image"]
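A minimal usage sketch of the demo pipeline above (the file name "query.jpg" is illustrative, and the image is assumed to carry EXIF intrinsics and GPS; building the OSM canvas for the returned bounding box is done with the repo's maploc.osm utilities and is elided here):

from maploc.demo import Demo, read_input_image

demo = Demo(experiment_or_path="OrienterNet_MGL")  # downloads the checkpoint on first use
image, camera, roll_pitch, proj, bbox, latlon = read_input_image(
    "query.jpg", tile_size_meters=64
)
# canvas = ...  # rasterize the OSM tile covering `bbox` (see maploc.osm.raster.Canvas)
# xy, yaw, prob, neural_map, image_rect = demo.localize(
#     image, camera, canvas, roll_pitch=roll_pitch
# )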
maploc/evaluation/kitti.py ADDED
@@ -0,0 +1,89 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from typing import Optional, Tuple
6
+
7
+ from omegaconf import OmegaConf, DictConfig
8
+
9
+ from .. import logger
10
+ from ..data import KittiDataModule
11
+ from .run import evaluate
12
+
13
+
14
+ default_cfg_single = OmegaConf.create({})
15
+ # For the sequential evaluation, we need to center the map around the GT location,
16
+ # since random offsets would accumulate and leave only the GT location with a valid mask.
17
+ # This should not have much impact on the results.
18
+ default_cfg_sequential = OmegaConf.create(
19
+ {
20
+ "data": {
21
+ "mask_radius": KittiDataModule.default_cfg["max_init_error"],
22
+ "prior_range_rotation": KittiDataModule.default_cfg[
23
+ "max_init_error_rotation"
24
+ ]
25
+ + 1,
26
+ "max_init_error": 0,
27
+ "max_init_error_rotation": 0,
28
+ },
29
+ "chunking": {
30
+ "max_length": 100, # about 10s?
31
+ },
32
+ }
33
+ )
34
+
35
+
36
+ def run(
37
+ split: str,
38
+ experiment: str,
39
+ cfg: Optional[DictConfig] = None,
40
+ sequential: bool = False,
41
+ thresholds: Tuple[int] = (1, 3, 5),
42
+ **kwargs,
43
+ ):
44
+ cfg = cfg or {}
45
+ if isinstance(cfg, dict):
46
+ cfg = OmegaConf.create(cfg)
47
+ default = default_cfg_sequential if sequential else default_cfg_single
48
+ cfg = OmegaConf.merge(default, cfg)
49
+ dataset = KittiDataModule(cfg.get("data", {}))
50
+
51
+ metrics = evaluate(
52
+ experiment,
53
+ cfg,
54
+ dataset,
55
+ split=split,
56
+ sequential=sequential,
57
+ viz_kwargs=dict(show_dir_error=True, show_masked_prob=False),
58
+ **kwargs,
59
+ )
60
+
61
+ keys = ["directional_error", "yaw_max_error"]
62
+ if sequential:
63
+ keys += ["directional_seq_error", "yaw_seq_error"]
64
+ for k in keys:
65
+ rec = metrics[k].recall(thresholds).double().numpy().round(2).tolist()
66
+ logger.info("Recall %s: %s at %s m/°", k, rec, thresholds)
67
+ return metrics
68
+
69
+
70
+ if __name__ == "__main__":
71
+ parser = argparse.ArgumentParser()
72
+ parser.add_argument("--experiment", type=str, required=True)
73
+ parser.add_argument(
74
+ "--split", type=str, default="test", choices=["test", "val", "train"]
75
+ )
76
+ parser.add_argument("--sequential", action="store_true")
77
+ parser.add_argument("--output_dir", type=Path)
78
+ parser.add_argument("--num", type=int)
79
+ parser.add_argument("dotlist", nargs="*")
80
+ args = parser.parse_args()
81
+ cfg = OmegaConf.from_cli(args.dotlist)
82
+ run(
83
+ args.split,
84
+ args.experiment,
85
+ cfg,
86
+ args.sequential,
87
+ output_dir=args.output_dir,
88
+ num=args.num,
89
+ )
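For reference, the same evaluation can also be launched programmatically; a small sketch, assuming the KITTI data has been prepared as described in the repository and that "OrienterNet_MGL" resolves to the released checkpoint via resolve_checkpoint_path:

from maploc.evaluation.kitti import run

metrics = run(split="test", experiment="OrienterNet_MGL", sequential=False)
# recall at 1/3/5 m (and degrees) is logged for the directional and yaw errors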
maploc/evaluation/mapillary.py ADDED
@@ -0,0 +1,111 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import argparse
4
+ from pathlib import Path
5
+ from typing import Optional, Tuple
6
+
7
+ from omegaconf import OmegaConf, DictConfig
8
+
9
+ from .. import logger
10
+ from ..conf import data as conf_data_dir
11
+ from ..data import MapillaryDataModule
12
+ from .run import evaluate
13
+
14
+
15
+ split_overrides = {
16
+ "val": {
17
+ "scenes": [
18
+ "sanfrancisco_soma",
19
+ "sanfrancisco_hayes",
20
+ "amsterdam",
21
+ "berlin",
22
+ "lemans",
23
+ "montrouge",
24
+ "toulouse",
25
+ "nantes",
26
+ "vilnius",
27
+ "avignon",
28
+ "helsinki",
29
+ "milan",
30
+ "paris",
31
+ ],
32
+ },
33
+ }
34
+ data_cfg_train = OmegaConf.load(Path(conf_data_dir.__file__).parent / "mapillary.yaml")
35
+ data_cfg = OmegaConf.merge(
36
+ data_cfg_train,
37
+ {
38
+ "return_gps": True,
39
+ "add_map_mask": True,
40
+ "max_init_error": 32,
41
+ "loading": {"val": {"batch_size": 1, "num_workers": 0}},
42
+ },
43
+ )
44
+ default_cfg_single = OmegaConf.create({"data": data_cfg})
45
+ default_cfg_sequential = OmegaConf.create(
46
+ {
47
+ **default_cfg_single,
48
+ "chunking": {
49
+ "max_length": 10,
50
+ },
51
+ }
52
+ )
53
+
54
+
55
+ def run(
56
+ split: str,
57
+ experiment: str,
58
+ cfg: Optional[DictConfig] = None,
59
+ sequential: bool = False,
60
+ thresholds: Tuple[int] = (1, 3, 5),
61
+ **kwargs,
62
+ ):
63
+ cfg = cfg or {}
64
+ if isinstance(cfg, dict):
65
+ cfg = OmegaConf.create(cfg)
66
+ default = default_cfg_sequential if sequential else default_cfg_single
67
+ default = OmegaConf.merge(default, split_overrides[split])
68
+ cfg = OmegaConf.merge(default, cfg)
69
+ dataset = MapillaryDataModule(cfg.get("data", {}))
70
+
71
+ metrics = evaluate(experiment, cfg, dataset, split, sequential=sequential, **kwargs)
72
+
73
+ keys = [
74
+ "xy_max_error",
75
+ "xy_gps_error",
76
+ "yaw_max_error",
77
+ ]
78
+ if sequential:
79
+ keys += [
80
+ "xy_seq_error",
81
+ "xy_gps_seq_error",
82
+ "yaw_seq_error",
83
+ "yaw_gps_seq_error",
84
+ ]
85
+ for k in keys:
86
+ if k not in metrics:
87
+ logger.warning("Key %s not in metrics.", k)
88
+ continue
89
+ rec = metrics[k].recall(thresholds).double().numpy().round(2).tolist()
90
+ logger.info("Recall %s: %s at %s m/°", k, rec, thresholds)
91
+ return metrics
92
+
93
+
94
+ if __name__ == "__main__":
95
+ parser = argparse.ArgumentParser()
96
+ parser.add_argument("--experiment", type=str, required=True)
97
+ parser.add_argument("--split", type=str, default="val", choices=["val"])
98
+ parser.add_argument("--sequential", action="store_true")
99
+ parser.add_argument("--output_dir", type=Path)
100
+ parser.add_argument("--num", type=int)
101
+ parser.add_argument("dotlist", nargs="*")
102
+ args = parser.parse_args()
103
+ cfg = OmegaConf.from_cli(args.dotlist)
104
+ run(
105
+ args.split,
106
+ args.experiment,
107
+ cfg,
108
+ args.sequential,
109
+ output_dir=args.output_dir,
110
+ num=args.num,
111
+ )
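Configuration overrides can be passed programmatically instead of through the CLI dotlist; a sketch with illustrative values, assuming the MGL dataset is available locally:

from omegaconf import OmegaConf
from maploc.evaluation.mapillary import run

cfg = OmegaConf.create({"data": {"loading": {"val": {"num_workers": 4}}}})
metrics = run("val", "OrienterNet_MGL", cfg=cfg, sequential=False)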
maploc/evaluation/run.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import functools
4
+ from itertools import islice
5
+ from typing import Callable, Dict, Optional, Tuple
6
+ from pathlib import Path
7
+
8
+ import numpy as np
9
+ import torch
10
+ from omegaconf import DictConfig, OmegaConf
11
+ from torchmetrics import MetricCollection
12
+ from pytorch_lightning import seed_everything
13
+ from tqdm import tqdm
14
+
15
+ from .. import logger, EXPERIMENTS_PATH
16
+ from ..data.torch import collate, unbatch_to_device
17
+ from ..models.voting import argmax_xyr, fuse_gps
18
+ from ..models.metrics import AngleError, LateralLongitudinalError, Location2DError
19
+ from ..models.sequential import GPSAligner, RigidAligner
20
+ from ..module import GenericModule
21
+ from ..utils.io import download_file, DATA_URL
22
+ from .viz import plot_example_single, plot_example_sequential
23
+ from .utils import write_dump
24
+
25
+
26
+ pretrained_models = dict(
27
+ OrienterNet_MGL=("orienternet_mgl.ckpt", dict(num_rotations=256)),
28
+ )
29
+
30
+
31
+ def resolve_checkpoint_path(experiment_or_path: str) -> Path:
32
+ path = Path(experiment_or_path)
33
+ if not path.exists():
34
+ # provided name of experiment
35
+ path = Path(EXPERIMENTS_PATH, *experiment_or_path.split("/"))
36
+ if not path.exists():
37
+ if experiment_or_path in set(p for p, _ in pretrained_models.values()):
38
+ download_file(f"{DATA_URL}/{experiment_or_path}", path)
39
+ else:
40
+ raise FileNotFoundError(path)
41
+ if path.is_file():
42
+ return path
43
+ # provided only the experiment name
44
+ maybe_path = path / "last-step.ckpt"
45
+ if not maybe_path.exists():
46
+ maybe_path = path / "step.ckpt"
47
+ if not maybe_path.exists():
48
+ raise FileNotFoundError(f"Could not find any checkpoint in {path}.")
49
+ return maybe_path
50
+
51
+
52
+ @torch.no_grad()
53
+ def evaluate_single_image(
54
+ dataloader: torch.utils.data.DataLoader,
55
+ model: GenericModule,
56
+ num: Optional[int] = None,
57
+ callback: Optional[Callable] = None,
58
+ progress: bool = True,
59
+ mask_index: Optional[Tuple[int]] = None,
60
+ has_gps: bool = False,
61
+ ):
62
+ ppm = model.model.conf.pixel_per_meter
63
+ metrics = MetricCollection(model.model.metrics())
64
+ metrics["directional_error"] = LateralLongitudinalError(ppm)
65
+ if has_gps:
66
+ metrics["xy_gps_error"] = Location2DError("uv_gps", ppm)
67
+ metrics["xy_fused_error"] = Location2DError("uv_fused", ppm)
68
+ metrics["yaw_fused_error"] = AngleError("yaw_fused")
69
+ metrics = metrics.to(model.device)
70
+
71
+ for i, batch_ in enumerate(
72
+ islice(tqdm(dataloader, total=num, disable=not progress), num)
73
+ ):
74
+ batch = model.transfer_batch_to_device(batch_, model.device, i)
75
+ # Ablation: mask semantic classes
76
+ if mask_index is not None:
77
+ mask = batch["map"][0, mask_index[0]] == (mask_index[1] + 1)
78
+ batch["map"][0, mask_index[0]][mask] = 0
79
+ pred = model(batch)
80
+
81
+ if has_gps:
82
+ (uv_gps,) = pred["uv_gps"] = batch["uv_gps"]
83
+ pred["log_probs_fused"] = fuse_gps(
84
+ pred["log_probs"], uv_gps, ppm, sigma=batch["accuracy_gps"]
85
+ )
86
+ uvt_fused = argmax_xyr(pred["log_probs_fused"])
87
+ pred["uv_fused"] = uvt_fused[..., :2]
88
+ pred["yaw_fused"] = uvt_fused[..., -1]
89
+ del uv_gps, uvt_fused
90
+
91
+ results = metrics(pred, batch)
92
+ if callback is not None:
93
+ callback(
94
+ i, model, unbatch_to_device(pred), unbatch_to_device(batch_), results
95
+ )
96
+ del batch_, batch, pred, results
97
+
98
+ return metrics.cpu()
99
+
100
+
101
+ @torch.no_grad()
102
+ def evaluate_sequential(
103
+ dataset: torch.utils.data.Dataset,
104
+ chunk2idx: Dict,
105
+ model: GenericModule,
106
+ num: Optional[int] = None,
107
+ shuffle: bool = False,
108
+ callback: Optional[Callable] = None,
109
+ progress: bool = True,
110
+ num_rotations: int = 512,
111
+ mask_index: Optional[Tuple[int]] = None,
112
+ has_gps: bool = True,
113
+ ):
114
+ chunk_keys = list(chunk2idx)
115
+ if shuffle:
116
+ chunk_keys = [chunk_keys[i] for i in torch.randperm(len(chunk_keys))]
117
+ if num is not None:
118
+ chunk_keys = chunk_keys[:num]
119
+ lengths = [len(chunk2idx[k]) for k in chunk_keys]
120
+ logger.info(
121
+ "Min/max/med lengths: %d/%d/%d, total number of images: %d",
122
+ min(lengths),
123
+ np.median(lengths),
124
+ max(lengths),
125
+ sum(lengths),
126
+ )
127
+ viz = callback is not None
128
+
129
+ metrics = MetricCollection(model.model.metrics())
130
+ ppm = model.model.conf.pixel_per_meter
131
+ metrics["directional_error"] = LateralLongitudinalError(ppm)
132
+ metrics["xy_seq_error"] = Location2DError("uv_seq", ppm)
133
+ metrics["yaw_seq_error"] = AngleError("yaw_seq")
134
+ metrics["directional_seq_error"] = LateralLongitudinalError(ppm, key="uv_seq")
135
+ if has_gps:
136
+ metrics["xy_gps_error"] = Location2DError("uv_gps", ppm)
137
+ metrics["xy_gps_seq_error"] = Location2DError("uv_gps_seq", ppm)
138
+ metrics["yaw_gps_seq_error"] = AngleError("yaw_gps_seq")
139
+ metrics = metrics.to(model.device)
140
+
141
+ keys_save = ["uvr_max", "uv_max", "yaw_max", "uv_expectation"]
142
+ if has_gps:
143
+ keys_save.append("uv_gps")
144
+ if viz:
145
+ keys_save.append("log_probs")
146
+
147
+ for chunk_index, key in enumerate(tqdm(chunk_keys, disable=not progress)):
148
+ indices = chunk2idx[key]
149
+ aligner = RigidAligner(track_priors=viz, num_rotations=num_rotations)
150
+ if has_gps:
151
+ aligner_gps = GPSAligner(track_priors=viz, num_rotations=num_rotations)
152
+ batches = []
153
+ preds = []
154
+ for i in indices:
155
+ data = dataset[i]
156
+ data = model.transfer_batch_to_device(data, model.device, 0)
157
+ pred = model(collate([data]))
158
+
159
+ canvas = data["canvas"]
160
+ data["xy_geo"] = xy = canvas.to_xy(data["uv"].double())
161
+ data["yaw"] = yaw = data["roll_pitch_yaw"][-1].double()
162
+ aligner.update(pred["log_probs"][0], canvas, xy, yaw)
163
+
164
+ if has_gps:
165
+ (uv_gps) = pred["uv_gps"] = data["uv_gps"][None]
166
+ xy_gps = canvas.to_xy(uv_gps.double())
167
+ aligner_gps.update(xy_gps, data["accuracy_gps"], canvas, xy, yaw)
168
+
169
+ if not viz:
170
+ data.pop("image")
171
+ data.pop("map")
172
+ batches.append(data)
173
+ preds.append({k: pred[k][0] for k in keys_save})
174
+ del pred
175
+
176
+ xy_gt = torch.stack([b["xy_geo"] for b in batches])
177
+ yaw_gt = torch.stack([b["yaw"] for b in batches])
178
+ aligner.compute()
179
+ xy_seq, yaw_seq = aligner.transform(xy_gt, yaw_gt)
180
+ if has_gps:
181
+ aligner_gps.compute()
182
+ xy_gps_seq, yaw_gps_seq = aligner_gps.transform(xy_gt, yaw_gt)
183
+ results = []
184
+ for i in range(len(indices)):
185
+ preds[i]["uv_seq"] = batches[i]["canvas"].to_uv(xy_seq[i]).float()
186
+ preds[i]["yaw_seq"] = yaw_seq[i].float()
187
+ if has_gps:
188
+ preds[i]["uv_gps_seq"] = (
189
+ batches[i]["canvas"].to_uv(xy_gps_seq[i]).float()
190
+ )
191
+ preds[i]["yaw_gps_seq"] = yaw_gps_seq[i].float()
192
+ results.append(metrics(preds[i], batches[i]))
193
+ if viz:
194
+ callback(chunk_index, model, batches, preds, results, aligner)
195
+ del aligner, preds, batches, results
196
+ return metrics.cpu()
197
+
198
+
199
+ def evaluate(
200
+ experiment: str,
201
+ cfg: DictConfig,
202
+ dataset,
203
+ split: str,
204
+ sequential: bool = False,
205
+ output_dir: Optional[Path] = None,
206
+ callback: Optional[Callable] = None,
207
+ num_workers: int = 1,
208
+ viz_kwargs=None,
209
+ **kwargs,
210
+ ):
211
+ if experiment in pretrained_models:
212
+ experiment, cfg_override = pretrained_models[experiment]
213
+ cfg = OmegaConf.merge(OmegaConf.create(dict(model=cfg_override)), cfg)
214
+
215
+ logger.info("Evaluating model %s with config %s", experiment, cfg)
216
+ checkpoint_path = resolve_checkpoint_path(experiment)
217
+ model = GenericModule.load_from_checkpoint(
218
+ checkpoint_path, cfg=cfg, find_best=not experiment.endswith(".ckpt")
219
+ )
220
+ model = model.eval()
221
+ if torch.cuda.is_available():
222
+ model = model.cuda()
223
+
224
+ dataset.prepare_data()
225
+ dataset.setup()
226
+
227
+ if output_dir is not None:
228
+ output_dir.mkdir(exist_ok=True, parents=True)
229
+ if callback is None:
230
+ if sequential:
231
+ callback = plot_example_sequential
232
+ else:
233
+ callback = plot_example_single
234
+ callback = functools.partial(
235
+ callback, out_dir=output_dir, **(viz_kwargs or {})
236
+ )
237
+ kwargs = {**kwargs, "callback": callback}
238
+
239
+ seed_everything(dataset.cfg.seed)
240
+ if sequential:
241
+ dset, chunk2idx = dataset.sequence_dataset(split, **cfg.chunking)
242
+ metrics = evaluate_sequential(dset, chunk2idx, model, **kwargs)
243
+ else:
244
+ loader = dataset.dataloader(split, shuffle=True, num_workers=num_workers)
245
+ metrics = evaluate_single_image(loader, model, **kwargs)
246
+
247
+ results = metrics.compute()
248
+ logger.info("All results: %s", results)
249
+ if output_dir is not None:
250
+ write_dump(output_dir, experiment, cfg, results, metrics)
251
+ logger.info("Outputs have been written to %s.", output_dir)
252
+ return metrics
maploc/evaluation/utils.py ADDED
@@ -0,0 +1,40 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ from omegaconf import OmegaConf
5
+
6
+ from ..utils.io import write_json
7
+
8
+
9
+ def compute_recall(errors):
10
+ num_elements = len(errors)
11
+ sort_idx = np.argsort(errors)
12
+ errors = np.array(errors.copy())[sort_idx]
13
+ recall = (np.arange(num_elements) + 1) / num_elements
14
+ recall = np.r_[0, recall]
15
+ errors = np.r_[0, errors]
16
+ return errors, recall
17
+
18
+
19
+ def compute_auc(errors, recall, thresholds):
20
+ aucs = []
21
+ for t in thresholds:
22
+ last_index = np.searchsorted(errors, t, side="right")
23
+ r = np.r_[recall[:last_index], recall[last_index - 1]]
24
+ e = np.r_[errors[:last_index], t]
25
+ auc = np.trapz(r, x=e) / t
26
+ aucs.append(auc * 100)
27
+ return aucs
28
+
29
+
30
+ def write_dump(output_dir, experiment, cfg, results, metrics):
31
+ dump = {
32
+ "experiment": experiment,
33
+ "cfg": OmegaConf.to_container(cfg),
34
+ "results": results,
35
+ "errors": {},
36
+ }
37
+ for k, m in metrics.items():
38
+ if hasattr(m, "get_errors"):
39
+ dump["errors"][k] = m.get_errors().numpy()
40
+ write_json(output_dir / "log.json", dump)
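A small worked example of the two helpers above (the error values are made up): compute_recall turns a list of errors into a cumulative recall curve, and compute_auc integrates that curve up to each threshold and normalizes it to a percentage.

from maploc.evaluation.utils import compute_auc, compute_recall

errors = [0.4, 0.9, 2.5, 4.0, 10.0]  # localization errors in meters (illustrative)
errs, recall = compute_recall(errors)
aucs = compute_auc(errs, recall, thresholds=[1.0, 3.0, 5.0])
# each entry is the area under the cumulative recall curve up to the
# corresponding threshold, expressed in percent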
maploc/evaluation/viz.py ADDED
@@ -0,0 +1,178 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import numpy as np
4
+ import torch
5
+ import matplotlib.pyplot as plt
6
+
7
+ from ..utils.io import write_torch_image
8
+ from ..utils.viz_2d import plot_images, features_to_RGB, save_plot
9
+ from ..utils.viz_localization import (
10
+ likelihood_overlay,
11
+ plot_pose,
12
+ plot_dense_rotations,
13
+ add_circle_inset,
14
+ )
15
+ from ..osm.viz import Colormap, plot_nodes
16
+
17
+
18
+ def plot_example_single(
19
+ idx,
20
+ model,
21
+ pred,
22
+ data,
23
+ results,
24
+ plot_bev=True,
25
+ out_dir=None,
26
+ fig_for_paper=False,
27
+ show_gps=False,
28
+ show_fused=False,
29
+ show_dir_error=False,
30
+ show_masked_prob=False,
31
+ ):
32
+ scene, name, rasters, uv_gt = (data[k] for k in ("scene", "name", "map", "uv"))
33
+ uv_gps = data.get("uv_gps")
34
+ yaw_gt = data["roll_pitch_yaw"][-1].numpy()
35
+ image = data["image"].permute(1, 2, 0)
36
+ if "valid" in data:
37
+ image = image.masked_fill(~data["valid"].unsqueeze(-1), 0.3)
38
+
39
+ lp_uvt = lp_uv = pred["log_probs"]
40
+ if show_fused and "log_probs_fused" in pred:
41
+ lp_uvt = lp_uv = pred["log_probs_fused"]
42
+ elif not show_masked_prob and "scores_unmasked" in pred:
43
+ lp_uvt = lp_uv = pred["scores_unmasked"]
44
+ has_rotation = lp_uvt.ndim == 3
45
+ if has_rotation:
46
+ lp_uv = lp_uvt.max(-1).values
47
+ if lp_uv.min() > -np.inf:
48
+ lp_uv = lp_uv.clip(min=np.percentile(lp_uv, 1))
49
+ prob = lp_uv.exp()
50
+ uv_p, yaw_p = pred["uv_max"], pred.get("yaw_max")
51
+ if show_fused and "uv_fused" in pred:
52
+ uv_p, yaw_p = pred["uv_fused"], pred.get("yaw_fused")
53
+ feats_map = pred["map"]["map_features"][0]
54
+ (feats_map_rgb,) = features_to_RGB(feats_map.numpy())
55
+
56
+ text1 = rf'$\Delta xy$: {results["xy_max_error"]:.1f}m'
57
+ if has_rotation:
58
+ text1 += rf', $\Delta\theta$: {results["yaw_max_error"]:.1f}°'
59
+ if show_fused and "xy_fused_error" in results:
60
+ text1 += rf', $\Delta xy_{{fused}}$: {results["xy_fused_error"]:.1f}m'
61
+ text1 += rf', $\Delta\theta_{{fused}}$: {results["yaw_fused_error"]:.1f}°'
62
+ if show_dir_error and "directional_error" in results:
63
+ err_lat, err_lon = results["directional_error"]
64
+ text1 += rf", $\Delta$lateral/longitundinal={err_lat:.1f}m/{err_lon:.1f}m"
65
+ if "xy_gps_error" in results:
66
+ text1 += rf', $\Delta xy_{{GPS}}$: {results["xy_gps_error"]:.1f}m'
67
+
68
+ map_viz = Colormap.apply(rasters)
69
+ overlay = likelihood_overlay(prob.numpy(), map_viz.mean(-1, keepdims=True))
70
+ plot_images(
71
+ [image, map_viz, overlay, feats_map_rgb],
72
+ titles=[text1, "map", "likelihood", "neural map"],
73
+ dpi=75,
74
+ cmaps="jet",
75
+ )
76
+ fig = plt.gcf()
77
+ axes = fig.axes
78
+ axes[1].images[0].set_interpolation("none")
79
+ axes[2].images[0].set_interpolation("none")
80
+ Colormap.add_colorbar()
81
+ plot_nodes(1, rasters[2])
82
+
83
+ if show_gps and uv_gps is not None:
84
+ plot_pose([1], uv_gps, c="blue")
85
+ plot_pose([1], uv_gt, yaw_gt, c="red")
86
+ plot_pose([1], uv_p, yaw_p, c="k")
87
+ plot_dense_rotations(2, lp_uvt.exp())
88
+ inset_center = pred["uv_max"] if results["xy_max_error"] < 5 else uv_gt
89
+ axins = add_circle_inset(axes[2], inset_center)
90
+ axins.scatter(*uv_gt, lw=1, c="red", ec="k", s=50, zorder=15)
91
+ axes[0].text(
92
+ 0.003,
93
+ 0.003,
94
+ f"{scene}/{name}",
95
+ transform=axes[0].transAxes,
96
+ fontsize=3,
97
+ va="bottom",
98
+ ha="left",
99
+ color="w",
100
+ )
101
+ plt.show()
102
+ if out_dir is not None:
103
+ name_ = name.replace("/", "_")
104
+ p = str(out_dir / f"{scene}_{name_}_{{}}.pdf")
105
+ save_plot(p.format("pred"))
106
+ plt.close()
107
+
108
+ if fig_for_paper:
109
+ # !cp ../datasets/MGL/{scene}/images/{name}.jpg {out_dir}/{scene}_{name}.jpg
110
+ plot_images([map_viz])
111
+ plt.gca().images[0].set_interpolation("none")
112
+ plot_nodes(0, rasters[2])
113
+ plot_pose([0], uv_gt, yaw_gt, c="red")
114
+ plot_pose([0], pred["uv_max"], pred["yaw_max"], c="k")
115
+ save_plot(p.format("map"))
116
+ plt.close()
117
+ plot_images([lp_uv], cmaps="jet")
118
+ plot_dense_rotations(0, lp_uvt.exp())
119
+ save_plot(p.format("loglikelihood"), dpi=100)
120
+ plt.close()
121
+ plot_images([overlay])
122
+ plt.gca().images[0].set_interpolation("none")
123
+ axins = add_circle_inset(plt.gca(), inset_center)
124
+ axins.scatter(*uv_gt, lw=1, c="red", ec="k", s=50)
125
+ save_plot(p.format("likelihood"))
126
+ plt.close()
127
+ write_torch_image(
128
+ p.format("neuralmap").replace("pdf", "jpg"), feats_map_rgb
129
+ )
130
+ write_torch_image(p.format("image").replace("pdf", "jpg"), image.numpy())
131
+
132
+ if not plot_bev:
133
+ return
134
+
135
+ feats_q = pred["features_bev"]
136
+ mask_bev = pred["valid_bev"]
137
+ prior = None
138
+ if "log_prior" in pred["map"]:
139
+ prior = pred["map"]["log_prior"][0].sigmoid()
140
+ if "bev" in pred and "confidence" in pred["bev"]:
141
+ conf_q = pred["bev"]["confidence"]
142
+ else:
143
+ conf_q = torch.norm(feats_q, dim=0)
144
+ conf_q = conf_q.masked_fill(~mask_bev, np.nan)
145
+ (feats_q_rgb,) = features_to_RGB(feats_q.numpy(), masks=[mask_bev.numpy()])
146
+ # feats_map_rgb, feats_q_rgb, = features_to_RGB(
147
+ # feats_map.numpy(), feats_q.numpy(), masks=[None, mask_bev])
148
+ norm_map = torch.norm(feats_map, dim=0)
149
+
150
+ plot_images(
151
+ [conf_q, feats_q_rgb, norm_map] + ([] if prior is None else [prior]),
152
+ titles=["BEV confidence", "BEV features", "map norm"]
153
+ + ([] if prior is None else ["map prior"]),
154
+ dpi=50,
155
+ cmaps="jet",
156
+ )
157
+ plt.show()
158
+
159
+ if out_dir is not None:
160
+ save_plot(p.format("bev"))
161
+ plt.close()
162
+
163
+
164
+ def plot_example_sequential(
165
+ idx,
166
+ model,
167
+ pred,
168
+ data,
169
+ results,
170
+ plot_bev=True,
171
+ out_dir=None,
172
+ fig_for_paper=False,
173
+ show_gps=False,
174
+ show_fused=False,
175
+ show_dir_error=False,
176
+ show_masked_prob=False,
177
+ ):
178
+ return
maploc/models/__init__.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
4
+ # https://github.com/cvg/pixloc
5
+ # Released under the Apache License 2.0
6
+
7
+ import inspect
8
+
9
+ from .base import BaseModel
10
+
11
+
12
+ def get_class(mod_name, base_path, BaseClass):
13
+ """Get the class object which inherits from BaseClass and is defined in
14
+ the module named mod_name, child of base_path.
15
+ """
16
+ mod_path = "{}.{}".format(base_path, mod_name)
17
+ mod = __import__(mod_path, fromlist=[""])
18
+ classes = inspect.getmembers(mod, inspect.isclass)
19
+ # Filter classes defined in the module
20
+ classes = [c for c in classes if c[1].__module__ == mod_path]
21
+ # Filter classes inherited from BaseModel
22
+ classes = [c for c in classes if issubclass(c[1], BaseClass)]
23
+ assert len(classes) == 1, classes
24
+ return classes[0][1]
25
+
26
+
27
+ def get_model(name):
28
+ if name == "localizer":
29
+ name = "localizer_basic"
30
+ elif name == "rotation_localizer":
31
+ name = "localizer_basic_rotation"
32
+ elif name == "bev_localizer":
33
+ name = "localizer_bev_plane"
34
+ return get_class(name, __name__, BaseModel)
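A usage sketch of the dynamic lookup, assuming (as in this repository) that maploc/models/orienternet.py defines a single BaseModel subclass:

from maploc.models import get_model

OrienterNet = get_model("orienternet")  # imports maploc.models.orienternet and returns its model class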
maploc/models/base.py ADDED
@@ -0,0 +1,123 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
4
+ # https://github.com/cvg/pixloc
5
+ # Released under the Apache License 2.0
6
+
7
+ """
8
+ Base class for trainable models.
9
+ """
10
+
11
+ from abc import ABCMeta, abstractmethod
12
+ from copy import copy
13
+
14
+ import omegaconf
15
+ from omegaconf import OmegaConf
16
+ from torch import nn
17
+
18
+
19
+ class BaseModel(nn.Module, metaclass=ABCMeta):
20
+ """
21
+ What the child model is expected to declare:
22
+ default_conf: dictionary of the default configuration of the model.
23
+ It recursively updates the default_conf of all parent classes, and
24
+ it is updated by the user-provided configuration passed to __init__.
25
+ Configurations can be nested.
26
+
27
+ required_data_keys: list of expected keys in the input data dictionary.
28
+
29
+ strict_conf (optional): boolean. If false, BaseModel does not raise
30
+ an error when the user provides an unknown configuration entry.
31
+
32
+ _init(self, conf): initialization method, where conf is the final
33
+ configuration object (also accessible with `self.conf`). Accessing
34
+ unknown configuration entries will raise an error.
35
+
36
+ _forward(self, data): method that returns a dictionary of batched
37
+ prediction tensors based on a dictionary of batched input data tensors.
38
+
39
+ loss(self, pred, data): method that returns a dictionary of losses,
40
+ computed from model predictions and input data. Each loss is a batch
41
+ of scalars, i.e. a torch.Tensor of shape (B,).
42
+ The total loss to be optimized has the key `'total'`.
43
+
44
+ metrics(self, pred, data): method that returns a dictionary of metrics,
45
+ each as a batch of scalars.
46
+ """
47
+
48
+ base_default_conf = {
49
+ "name": None,
50
+ "trainable": True, # if false: do not optimize this model's parameters
51
+ "freeze_batch_normalization": False, # use test-time statistics
52
+ }
53
+ default_conf = {}
54
+ required_data_keys = []
55
+ strict_conf = True
56
+
57
+ def __init__(self, conf):
58
+ """Perform some logic and call the _init method of the child model."""
59
+ super().__init__()
60
+ default_conf = OmegaConf.merge(
61
+ self.base_default_conf, OmegaConf.create(self.default_conf)
62
+ )
63
+ if self.strict_conf:
64
+ OmegaConf.set_struct(default_conf, True)
65
+
66
+ # fixme: backward compatibility
67
+ if "pad" in conf and "pad" not in default_conf: # backward compat.
68
+ with omegaconf.read_write(conf):
69
+ with omegaconf.open_dict(conf):
70
+ conf["interpolation"] = {"pad": conf.pop("pad")}
71
+
72
+ if isinstance(conf, dict):
73
+ conf = OmegaConf.create(conf)
74
+ self.conf = conf = OmegaConf.merge(default_conf, conf)
75
+ OmegaConf.set_readonly(conf, True)
76
+ OmegaConf.set_struct(conf, True)
77
+ self.required_data_keys = copy(self.required_data_keys)
78
+ self._init(conf)
79
+
80
+ if not conf.trainable:
81
+ for p in self.parameters():
82
+ p.requires_grad = False
83
+
84
+ def train(self, mode=True):
85
+ super().train(mode)
86
+
87
+ def freeze_bn(module):
88
+ if isinstance(module, nn.modules.batchnorm._BatchNorm):
89
+ module.eval()
90
+
91
+ if self.conf.freeze_batch_normalization:
92
+ self.apply(freeze_bn)
93
+
94
+ return self
95
+
96
+ def forward(self, data):
97
+ """Check the data and call the _forward method of the child model."""
98
+
99
+ def recursive_key_check(expected, given):
100
+ for key in expected:
101
+ assert key in given, f"Missing key {key} in data"
102
+ if isinstance(expected, dict):
103
+ recursive_key_check(expected[key], given[key])
104
+
105
+ recursive_key_check(self.required_data_keys, data)
106
+ return self._forward(data)
107
+
108
+ @abstractmethod
109
+ def _init(self, conf):
110
+ """To be implemented by the child class."""
111
+ raise NotImplementedError
112
+
113
+ @abstractmethod
114
+ def _forward(self, data):
115
+ """To be implemented by the child class."""
116
+ raise NotImplementedError
117
+
118
+ def loss(self, pred, data):
119
+ """To be implemented by the child class."""
120
+ raise NotImplementedError
121
+
122
+ def metrics(self):
123
+ return {} # no metrics
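A minimal sketch of the contract described in the BaseModel docstring above (a toy model for illustration, not part of the repository):

import torch
from torch import nn

from maploc.models.base import BaseModel


class TinyRegressor(BaseModel):
    default_conf = {"dim": 16}
    required_data_keys = ["image"]

    def _init(self, conf):
        self.head = nn.Conv2d(3, conf.dim, kernel_size=1)

    def _forward(self, data):
        return {"features": self.head(data["image"])}

    def loss(self, pred, data):
        # one scalar per batch element, with the total under the key "total"
        return {"total": pred["features"].abs().mean(dim=(1, 2, 3))}


model = TinyRegressor({"name": "tiny"})
pred = model({"image": torch.zeros(2, 3, 8, 8)})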
maploc/models/bev_net.py ADDED
@@ -0,0 +1,61 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import torch.nn as nn
4
+ from torchvision.models.resnet import Bottleneck
5
+
6
+ from .base import BaseModel
7
+ from .feature_extractor import AdaptationBlock
8
+ from .utils import checkpointed
9
+
10
+
11
+ class BEVNet(BaseModel):
12
+ default_conf = {
13
+ "pretrained": True,
14
+ "num_blocks": "???",
15
+ "latent_dim": "???",
16
+ "input_dim": "${.latent_dim}",
17
+ "output_dim": "${.latent_dim}",
18
+ "confidence": False,
19
+ "norm_layer": "nn.BatchNorm2d", # normalization in decoder blocks
20
+ "checkpointed": False, # whether to use gradient checkpointing
21
+ "padding": "zeros",
22
+ }
23
+
24
+ def _init(self, conf):
25
+ blocks = []
26
+ Block = checkpointed(Bottleneck, do=conf.checkpointed)
27
+ for i in range(conf.num_blocks):
28
+ dim = conf.input_dim if i == 0 else conf.latent_dim
29
+ blocks.append(
30
+ Block(
31
+ dim,
32
+ conf.latent_dim // Bottleneck.expansion,
33
+ norm_layer=eval(conf.norm_layer),
34
+ )
35
+ )
36
+ self.blocks = nn.Sequential(*blocks)
37
+ self.output_layer = AdaptationBlock(conf.latent_dim, conf.output_dim)
38
+ if conf.confidence:
39
+ self.confidence_layer = AdaptationBlock(conf.latent_dim, 1)
40
+
41
+ def update_padding(module):
42
+ if isinstance(module, nn.Conv2d):
43
+ module.padding_mode = conf.padding
44
+
45
+ if conf.padding != "zeros":
46
+ self.blocks.apply(update_padding)
47
+
48
+ def _forward(self, data):
49
+ features = self.blocks(data["input"])
50
+ pred = {
51
+ "output": self.output_layer(features),
52
+ }
53
+ if self.conf.confidence:
54
+ pred["confidence"] = self.confidence_layer(features).squeeze(1).sigmoid()
55
+ return pred
56
+
57
+ def loss(self, pred, data):
58
+ raise NotImplementedError
59
+
60
+ def metrics(self, pred, data):
61
+ raise NotImplementedError
maploc/models/bev_projection.py ADDED
@@ -0,0 +1,91 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import torch
4
+ from torch.nn.functional import grid_sample
5
+
6
+ from ..utils.geometry import from_homogeneous
7
+ from .utils import make_grid
8
+
9
+
10
+ class PolarProjectionDepth(torch.nn.Module):
11
+ def __init__(self, z_max, ppm, scale_range, z_min=None):
12
+ super().__init__()
13
+ self.z_max = z_max
14
+ self.Δ = Δ = 1 / ppm
15
+ self.z_min = z_min = Δ if z_min is None else z_min
16
+ self.scale_range = scale_range
17
+ z_steps = torch.arange(z_min, z_max + Δ, Δ)
18
+ self.register_buffer("depth_steps", z_steps, persistent=False)
19
+
20
+ def sample_depth_scores(self, pixel_scales, camera):
21
+ scale_steps = camera.f[..., None, 1] / self.depth_steps.flip(-1)
22
+ log_scale_steps = torch.log2(scale_steps)
23
+ scale_min, scale_max = self.scale_range
24
+ log_scale_norm = (log_scale_steps - scale_min) / (scale_max - scale_min)
25
+ log_scale_norm = log_scale_norm * 2 - 1 # in [-1, 1]
26
+
27
+ values = pixel_scales.flatten(1, 2).unsqueeze(-1)
28
+ indices = log_scale_norm.unsqueeze(-1)
29
+ indices = torch.stack([torch.zeros_like(indices), indices], -1)
30
+ depth_scores = grid_sample(values, indices, align_corners=True)
31
+ depth_scores = depth_scores.reshape(
32
+ pixel_scales.shape[:-1] + (len(self.depth_steps),)
33
+ )
34
+ return depth_scores
35
+
36
+ def forward(
37
+ self,
38
+ image,
39
+ pixel_scales,
40
+ camera,
41
+ return_total_score=False,
42
+ ):
43
+ depth_scores = self.sample_depth_scores(pixel_scales, camera)
44
+ depth_prob = torch.softmax(depth_scores, dim=1)
45
+ image_polar = torch.einsum("...dhw,...hwz->...dzw", image, depth_prob)
46
+ if return_total_score:
47
+ cell_score = torch.logsumexp(depth_scores, dim=1, keepdim=True)
48
+ return image_polar, cell_score.squeeze(1)
49
+ return image_polar
50
+
51
+
52
+ class CartesianProjection(torch.nn.Module):
53
+ def __init__(self, z_max, x_max, ppm, z_min=None):
54
+ super().__init__()
55
+ self.z_max = z_max
56
+ self.x_max = x_max
57
+ self.Δ = Δ = 1 / ppm
58
+ self.z_min = z_min = Δ if z_min is None else z_min
59
+
60
+ grid_xz = make_grid(
61
+ x_max * 2 + Δ, z_max, step_y=Δ, step_x=Δ, orig_y=Δ, orig_x=-x_max, y_up=True
62
+ )
63
+ self.register_buffer("grid_xz", grid_xz, persistent=False)
64
+
65
+ def grid_to_polar(self, cam):
66
+ f, c = cam.f[..., 0][..., None, None], cam.c[..., 0][..., None, None]
67
+ u = from_homogeneous(self.grid_xz).squeeze(-1) * f + c
68
+ z_idx = (self.grid_xz[..., 1] - self.z_min) / self.Δ # convert z value to index
69
+ z_idx = z_idx[None].expand_as(u)
70
+ grid_polar = torch.stack([u, z_idx], -1)
71
+ return grid_polar
72
+
73
+ def sample_from_polar(self, image_polar, valid_polar, grid_uz):
74
+ size = grid_uz.new_tensor(image_polar.shape[-2:][::-1])
75
+ grid_uz_norm = (grid_uz + 0.5) / size * 2 - 1
76
+ grid_uz_norm = grid_uz_norm * grid_uz.new_tensor([1, -1]) # y axis is up
77
+ image_bev = grid_sample(image_polar, grid_uz_norm, align_corners=False)
78
+
79
+ if valid_polar is None:
80
+ valid = torch.ones_like(image_polar[..., :1, :, :])
81
+ else:
82
+ valid = valid_polar.to(image_polar)[:, None]
83
+ valid = grid_sample(valid, grid_uz_norm, align_corners=False)
84
+ valid = valid.squeeze(1) > (1 - 1e-4)
85
+
86
+ return image_bev, valid
87
+
88
+ def forward(self, image_polar, valid_polar, cam):
89
+ grid_uz = self.grid_to_polar(cam)
90
+ image, valid = self.sample_from_polar(image_polar, valid_polar, grid_uz)
91
+ return image, valid, grid_uz
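For reference, grid_to_polar above is the pinhole projection restricted to the ground plane: a BEV cell at camera-frame coordinates (x, z) maps to the image column u = f_x * x / z + c_x, and its depth is converted to a plane index (z - z_min) / Δ; sample_from_polar then bilinearly samples the polar feature volume at (u, z_index) to produce the Cartesian BEV features.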
maploc/models/feature_extractor.py ADDED
@@ -0,0 +1,231 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ # Adapted from PixLoc, Paul-Edouard Sarlin, ETH Zurich
4
+ # https://github.com/cvg/pixloc
5
+ # Released under the Apache License 2.0
6
+
7
+ """
8
+ Flexible UNet model which takes any Torchvision backbone as encoder.
9
+ Predicts multi-level features and makes sure that they are well aligned.
10
+ """
11
+
12
+ import torch
13
+ import torch.nn as nn
14
+ import torchvision
15
+
16
+ from .base import BaseModel
17
+ from .utils import checkpointed
18
+
19
+
20
+ class DecoderBlock(nn.Module):
21
+ def __init__(
22
+ self, previous, skip, out, num_convs=1, norm=nn.BatchNorm2d, padding="zeros"
23
+ ):
24
+ super().__init__()
25
+
26
+ self.upsample = nn.Upsample(
27
+ scale_factor=2, mode="bilinear", align_corners=False
28
+ )
29
+
30
+ layers = []
31
+ for i in range(num_convs):
32
+ conv = nn.Conv2d(
33
+ previous + skip if i == 0 else out,
34
+ out,
35
+ kernel_size=3,
36
+ padding=1,
37
+ bias=norm is None,
38
+ padding_mode=padding,
39
+ )
40
+ layers.append(conv)
41
+ if norm is not None:
42
+ layers.append(norm(out))
43
+ layers.append(nn.ReLU(inplace=True))
44
+ self.layers = nn.Sequential(*layers)
45
+
46
+ def forward(self, previous, skip):
47
+ upsampled = self.upsample(previous)
48
+ # If the shape of the input map `skip` is not a multiple of 2,
49
+ # it will not match the shape of the upsampled map `upsampled`.
50
+ # If the downsampling uses ceil_mode=False, we need to crop `skip`.
51
+ # If it uses ceil_mode=True (not supported here), we should pad it.
52
+ _, _, hu, wu = upsampled.shape
53
+ _, _, hs, ws = skip.shape
54
+ assert (hu <= hs) and (wu <= ws), "Using ceil_mode=True in pooling?"
55
+ # assert (hu == hs) and (wu == ws), 'Careful about padding'
56
+ skip = skip[:, :, :hu, :wu]
57
+ return self.layers(torch.cat([upsampled, skip], dim=1))
58
+
59
+
60
+ class AdaptationBlock(nn.Sequential):
61
+ def __init__(self, inp, out):
62
+ conv = nn.Conv2d(inp, out, kernel_size=1, padding=0, bias=True)
63
+ super().__init__(conv)
64
+
65
+
66
+ class FeatureExtractor(BaseModel):
67
+ default_conf = {
68
+ "pretrained": True,
69
+ "input_dim": 3,
70
+ "output_scales": [0, 2, 4], # what scales to adapt and output
71
+ "output_dim": 128, # # of channels in output feature maps
72
+ "encoder": "vgg16", # string (torchvision net) or list of channels
73
+ "num_downsample": 4, # how many downsample blocks (if VGG-style net)
74
+ "decoder": [64, 64, 64, 64], # list of channels of decoder
75
+ "decoder_norm": "nn.BatchNorm2d", # normalization in decoder blocks
76
+ "do_average_pooling": False,
77
+ "checkpointed": False, # whether to use gradient checkpointing
78
+ "padding": "zeros",
79
+ }
80
+ mean = [0.485, 0.456, 0.406]
81
+ std = [0.229, 0.224, 0.225]
82
+
83
+ def build_encoder(self, conf):
84
+ assert isinstance(conf.encoder, str)
85
+ if conf.pretrained:
86
+ assert conf.input_dim == 3
87
+ Encoder = getattr(torchvision.models, conf.encoder)
88
+ encoder = Encoder(weights="DEFAULT" if conf.pretrained else None)
89
+ Block = checkpointed(torch.nn.Sequential, do=conf.checkpointed)
90
+ assert max(conf.output_scales) <= conf.num_downsample
91
+
92
+ if conf.encoder.startswith("vgg"):
93
+ # Parse the layers and pack them into downsampling blocks
94
+ # It's easy for VGG-style nets because of their linear structure.
95
+ # This does not handle strided convs and residual connections
96
+ skip_dims = []
97
+ previous_dim = None
98
+ blocks = [[]]
99
+ for i, layer in enumerate(encoder.features):
100
+ if isinstance(layer, torch.nn.Conv2d):
101
+ # Change the first conv layer if the input dim mismatches
102
+ if i == 0 and conf.input_dim != layer.in_channels:
103
+ args = {k: getattr(layer, k) for k in layer.__constants__}
104
+ args.pop("output_padding")
105
+ layer = torch.nn.Conv2d(
106
+ **{**args, "in_channels": conf.input_dim}
107
+ )
108
+ previous_dim = layer.out_channels
109
+ elif isinstance(layer, torch.nn.MaxPool2d):
110
+ assert previous_dim is not None
111
+ skip_dims.append(previous_dim)
112
+ if (conf.num_downsample + 1) == len(blocks):
113
+ break
114
+ blocks.append([]) # start a new block
115
+ if conf.do_average_pooling:
116
+ assert layer.dilation == 1
117
+ layer = torch.nn.AvgPool2d(
118
+ kernel_size=layer.kernel_size,
119
+ stride=layer.stride,
120
+ padding=layer.padding,
121
+ ceil_mode=layer.ceil_mode,
122
+ count_include_pad=False,
123
+ )
124
+ blocks[-1].append(layer)
125
+ encoder = [Block(*b) for b in blocks]
126
+ elif conf.encoder.startswith("resnet"):
127
+ # Manually define the ResNet blocks such that the downsampling comes first
128
+ assert conf.encoder[len("resnet") :] in ["18", "34", "50", "101"]
129
+ assert conf.input_dim == 3, "Unsupported for now."
130
+ block1 = torch.nn.Sequential(encoder.conv1, encoder.bn1, encoder.relu)
131
+ block2 = torch.nn.Sequential(encoder.maxpool, encoder.layer1)
132
+ block3 = encoder.layer2
133
+ block4 = encoder.layer3
134
+ block5 = encoder.layer4
135
+ blocks = [block1, block2, block3, block4, block5]
136
+ # Extract the output dimension of each block
137
+ skip_dims = [encoder.conv1.out_channels]
138
+ for i in range(1, 5):
139
+ modules = getattr(encoder, f"layer{i}")[-1]._modules
140
+ conv = sorted(k for k in modules if k.startswith("conv"))[-1]
141
+ skip_dims.append(modules[conv].out_channels)
142
+ # Add a dummy block such that the first one does not downsample
143
+ encoder = [torch.nn.Identity()] + [Block(b) for b in blocks]
144
+ skip_dims = [3] + skip_dims
145
+ # Trim based on the requested encoder size
146
+ encoder = encoder[: conf.num_downsample + 1]
147
+ skip_dims = skip_dims[: conf.num_downsample + 1]
148
+ else:
149
+ raise NotImplementedError(conf.encoder)
150
+
151
+ assert (conf.num_downsample + 1) == len(encoder)
152
+ encoder = nn.ModuleList(encoder)
153
+
154
+ return encoder, skip_dims
155
+
156
+ def _init(self, conf):
157
+ # Encoder
158
+ self.encoder, skip_dims = self.build_encoder(conf)
159
+ self.skip_dims = skip_dims
160
+
161
+ def update_padding(module):
162
+ if isinstance(module, nn.Conv2d):
163
+ module.padding_mode = conf.padding
164
+
165
+ if conf.padding != "zeros":
166
+ self.encoder.apply(update_padding)
167
+
168
+ # Decoder
169
+ if conf.decoder is not None:
170
+ assert len(conf.decoder) == (len(skip_dims) - 1)
171
+ Block = checkpointed(DecoderBlock, do=conf.checkpointed)
172
+ norm = eval(conf.decoder_norm) if conf.decoder_norm else None # noqa
173
+
174
+ previous = skip_dims[-1]
175
+ decoder = []
176
+ for out, skip in zip(conf.decoder, skip_dims[:-1][::-1]):
177
+ decoder.append(
178
+ Block(previous, skip, out, norm=norm, padding=conf.padding)
179
+ )
180
+ previous = out
181
+ self.decoder = nn.ModuleList(decoder)
182
+
183
+ # Adaptation layers
184
+ adaptation = []
185
+ for idx, i in enumerate(conf.output_scales):
186
+ if conf.decoder is None or i == (len(self.encoder) - 1):
187
+ input_ = skip_dims[i]
188
+ else:
189
+ input_ = conf.decoder[-1 - i]
190
+
191
+ # out_dim can be an int (same for all scales) or a list (per scale)
192
+ dim = conf.output_dim
193
+ if not isinstance(dim, int):
194
+ dim = dim[idx]
195
+
196
+ block = AdaptationBlock(input_, dim)
197
+ adaptation.append(block)
198
+ self.adaptation = nn.ModuleList(adaptation)
199
+ self.scales = [2**s for s in conf.output_scales]
200
+
201
+ def _forward(self, data):
202
+ image = data["image"]
203
+ if self.conf.pretrained:
204
+ mean, std = image.new_tensor(self.mean), image.new_tensor(self.std)
205
+ image = (image - mean[:, None, None]) / std[:, None, None]
206
+
207
+ skip_features = []
208
+ features = image
209
+ for block in self.encoder:
210
+ features = block(features)
211
+ skip_features.append(features)
212
+
213
+ if self.conf.decoder:
214
+ pre_features = [skip_features[-1]]
215
+ for block, skip in zip(self.decoder, skip_features[:-1][::-1]):
216
+ pre_features.append(block(pre_features[-1], skip))
217
+ pre_features = pre_features[::-1] # fine to coarse
218
+ else:
219
+ pre_features = skip_features
220
+
221
+ out_features = []
222
+ for adapt, i in zip(self.adaptation, self.conf.output_scales):
223
+ out_features.append(adapt(pre_features[i]))
224
+ pred = {"feature_maps": out_features, "skip_features": skip_features}
225
+ return pred
226
+
227
+ def loss(self, pred, data):
228
+ raise NotImplementedError
229
+
230
+ def metrics(self, pred, data):
231
+ raise NotImplementedError
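A configuration sketch for the extractor above (the values are illustrative, and pretrained is disabled here only to avoid a weight download):

import torch

from maploc.models.feature_extractor import FeatureExtractor

net = FeatureExtractor(
    {
        "encoder": "vgg16",
        "num_downsample": 4,
        "decoder": [64, 64, 64, 64],
        "output_scales": [0, 2, 4],
        "output_dim": 128,
        "pretrained": False,
    }
)
pred = net({"image": torch.zeros(1, 3, 256, 256)})
# three aligned feature maps at strides 1, 4 and 16, each with 128 channels
print([f.shape for f in pred["feature_maps"]])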
maploc/models/feature_extractor_v2.py ADDED
@@ -0,0 +1,192 @@
1
+ import logging
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torchvision
7
+ from torchvision.models.feature_extraction import create_feature_extractor
8
+
9
+ from .base import BaseModel
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+
14
+ class DecoderBlock(nn.Module):
15
+ def __init__(
16
+ self, previous, out, ksize=3, num_convs=1, norm=nn.BatchNorm2d, padding="zeros"
17
+ ):
18
+ super().__init__()
19
+ layers = []
20
+ for i in range(num_convs):
21
+ conv = nn.Conv2d(
22
+ previous if i == 0 else out,
23
+ out,
24
+ kernel_size=ksize,
25
+ padding=ksize // 2,
26
+ bias=norm is None,
27
+ padding_mode=padding,
28
+ )
29
+ layers.append(conv)
30
+ if norm is not None:
31
+ layers.append(norm(out))
32
+ layers.append(nn.ReLU(inplace=True))
33
+ self.layers = nn.Sequential(*layers)
34
+
35
+ def forward(self, previous, skip):
36
+ _, _, hp, wp = previous.shape
37
+ _, _, hs, ws = skip.shape
38
+ scale = 2 ** np.round(np.log2(np.array([hs / hp, ws / wp])))
39
+ upsampled = nn.functional.interpolate(
40
+ previous, scale_factor=scale.tolist(), mode="bilinear", align_corners=False
41
+ )
42
+ # If the shape of the input map `skip` is not a multiple of 2,
43
+ # it will not match the shape of the upsampled map `upsampled`.
44
+ # If the downsampling uses ceil_mode=False, we nedd to crop `skip`.
45
+ # If it uses ceil_mode=True (not supported here), we should pad it.
46
+ _, _, hu, wu = upsampled.shape
47
+ _, _, hs, ws = skip.shape
48
+ if (hu <= hs) and (wu <= ws):
49
+ skip = skip[:, :, :hu, :wu]
50
+ elif (hu >= hs) and (wu >= ws):
51
+ skip = nn.functional.pad(skip, [0, wu - ws, 0, hu - hs])
52
+ else:
53
+ raise ValueError(
54
+ f"Inconsistent skip vs upsampled shapes: {(hs, ws)}, {(hu, wu)}"
55
+ )
56
+
57
+ return self.layers(skip) + upsampled
58
+
59
+
60
+ class FPN(nn.Module):
61
+ def __init__(self, in_channels_list, out_channels, **kw):
62
+ super().__init__()
63
+ self.first = nn.Conv2d(
64
+ in_channels_list[-1], out_channels, 1, padding=0, bias=True
65
+ )
66
+ self.blocks = nn.ModuleList(
67
+ [
68
+ DecoderBlock(c, out_channels, ksize=1, **kw)
69
+ for c in in_channels_list[::-1][1:]
70
+ ]
71
+ )
72
+ self.out = nn.Sequential(
73
+ nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
74
+ nn.BatchNorm2d(out_channels),
75
+ nn.ReLU(inplace=True),
76
+ )
77
+
78
+ def forward(self, layers):
79
+ feats = None
80
+ for idx, x in enumerate(reversed(layers.values())):
81
+ if feats is None:
82
+ feats = self.first(x)
83
+ else:
84
+ feats = self.blocks[idx - 1](feats, x)
85
+ out = self.out(feats)
86
+ return out
87
+
88
+
89
+ def remove_conv_stride(conv):
90
+ conv_new = nn.Conv2d(
91
+ conv.in_channels,
92
+ conv.out_channels,
93
+ conv.kernel_size,
94
+ bias=conv.bias is not None,
95
+ stride=1,
96
+ padding=conv.padding,
97
+ )
98
+ conv_new.weight = conv.weight
99
+ conv_new.bias = conv.bias
100
+ return conv_new
101
+
102
+
103
+ class FeatureExtractor(BaseModel):
104
+ default_conf = {
105
+ "pretrained": True,
106
+ "input_dim": 3,
107
+ "output_dim": 128, # # of channels in output feature maps
108
+ "encoder": "resnet50", # torchvision net as string
109
+ "remove_stride_from_first_conv": False,
110
+ "num_downsample": None, # how many downsample block
111
+ "decoder_norm": "nn.BatchNorm2d", # normalization ind decoder blocks
112
+ "do_average_pooling": False,
113
+ "checkpointed": False, # whether to use gradient checkpointing
114
+ }
115
+ mean = [0.485, 0.456, 0.406]
116
+ std = [0.229, 0.224, 0.225]
117
+
118
+ def build_encoder(self, conf):
119
+ assert isinstance(conf.encoder, str)
120
+ if conf.pretrained:
121
+ assert conf.input_dim == 3
122
+ Encoder = getattr(torchvision.models, conf.encoder)
123
+
124
+ kw = {}
125
+ if conf.encoder.startswith("resnet"):
126
+ layers = ["relu", "layer1", "layer2", "layer3", "layer4"]
127
+ kw["replace_stride_with_dilation"] = [False, False, False]
128
+ elif conf.encoder == "vgg13":
129
+ layers = [
130
+ "features.3",
131
+ "features.8",
132
+ "features.13",
133
+ "features.18",
134
+ "features.23",
135
+ ]
136
+ elif conf.encoder == "vgg16":
137
+ layers = [
138
+ "features.3",
139
+ "features.8",
140
+ "features.15",
141
+ "features.22",
142
+ "features.29",
143
+ ]
144
+ else:
145
+ raise NotImplementedError(conf.encoder)
146
+
147
+ if conf.num_downsample is not None:
148
+ layers = layers[: conf.num_downsample]
149
+ encoder = Encoder(weights="DEFAULT" if conf.pretrained else None, **kw)
150
+ encoder = create_feature_extractor(encoder, return_nodes=layers)
151
+ if conf.encoder.startswith("resnet") and conf.remove_stride_from_first_conv:
152
+ encoder.conv1 = remove_conv_stride(encoder.conv1)
153
+
154
+ if conf.do_average_pooling:
155
+ raise NotImplementedError
156
+ if conf.checkpointed:
157
+ raise NotImplementedError
158
+
159
+ return encoder, layers
160
+
161
+ def _init(self, conf):
162
+ # Preprocessing
163
+ self.register_buffer("mean_", torch.tensor(self.mean), persistent=False)
164
+ self.register_buffer("std_", torch.tensor(self.std), persistent=False)
165
+
166
+ # Encoder
167
+ self.encoder, self.layers = self.build_encoder(conf)
168
+ s = 128
169
+ inp = torch.zeros(1, 3, s, s)
170
+ features = list(self.encoder(inp).values())
171
+ self.skip_dims = [x.shape[1] for x in features]
172
+ self.layer_strides = [s / f.shape[-1] for f in features]
173
+ self.scales = [self.layer_strides[0]]
174
+
175
+ # Decoder
176
+ norm = eval(conf.decoder_norm) if conf.decoder_norm else None # noqa
177
+ self.decoder = FPN(self.skip_dims, out_channels=conf.output_dim, norm=norm)
178
+
179
+ logger.debug(
180
+ "Built feature extractor with layers {name:dim:stride}:\n"
181
+ f"{list(zip(self.layers, self.skip_dims, self.layer_strides))}\n"
182
+ f"and output scales {self.scales}."
183
+ )
184
+
185
+ def _forward(self, data):
186
+ image = data["image"]
187
+ image = (image - self.mean_[:, None, None]) / self.std_[:, None, None]
188
+
189
+ skip_features = self.encoder(image)
190
+ output = self.decoder(skip_features)
191
+ pred = {"feature_maps": [output], "skip_features": skip_features}
192
+ return pred
maploc/models/map_encoder.py ADDED
@@ -0,0 +1,66 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+ from .base import BaseModel
7
+ from .feature_extractor import FeatureExtractor
8
+
9
+
10
+ class MapEncoder(BaseModel):
11
+ default_conf = {
12
+ "embedding_dim": "???",
13
+ "output_dim": None,
14
+ "num_classes": "???",
15
+ "backbone": "???",
16
+ "unary_prior": False,
17
+ }
18
+
19
+ def _init(self, conf):
20
+ self.embeddings = torch.nn.ModuleDict(
21
+ {
22
+ k: torch.nn.Embedding(n + 1, conf.embedding_dim)
23
+ for k, n in conf.num_classes.items()
24
+ }
25
+ )
26
+ input_dim = len(conf.num_classes) * conf.embedding_dim
27
+ output_dim = conf.output_dim
28
+ if output_dim is None:
29
+ output_dim = conf.backbone.output_dim
30
+ if conf.unary_prior:
31
+ output_dim += 1
32
+ if conf.backbone is None:
33
+ self.encoder = nn.Conv2d(input_dim, output_dim, 1)
34
+ elif conf.backbone == "simple":
35
+ self.encoder = nn.Sequential(
36
+ nn.Conv2d(input_dim, 128, 3, padding=1),
37
+ nn.ReLU(inplace=True),
38
+ nn.Conv2d(128, 128, 3, padding=1),
39
+ nn.ReLU(inplace=True),
40
+ nn.Conv2d(128, output_dim, 3, padding=1),
41
+ )
42
+ else:
43
+ self.encoder = FeatureExtractor(
44
+ {
45
+ **conf.backbone,
46
+ "input_dim": input_dim,
47
+ "output_dim": output_dim,
48
+ }
49
+ )
50
+
51
+ def _forward(self, data):
52
+ embeddings = [
53
+ self.embeddings[k](data["map"][:, i])
54
+ for i, k in enumerate(("areas", "ways", "nodes"))
55
+ ]
56
+ embeddings = torch.cat(embeddings, dim=-1).permute(0, 3, 1, 2)
57
+ if isinstance(self.encoder, BaseModel):
58
+ features = self.encoder({"image": embeddings})["feature_maps"]
59
+ else:
60
+ features = [self.encoder(embeddings)]
61
+ pred = {}
62
+ if self.conf.unary_prior:
63
+ pred["log_prior"] = [f[:, -1] for f in features]
64
+ features = [f[:, :-1] for f in features]
65
+ pred["map_features"] = features
66
+ return pred
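A shape sketch for the map encoder above (the class counts and dimensions are illustrative, not the values used by the released configs):

import torch

from maploc.models.map_encoder import MapEncoder

encoder = MapEncoder(
    {
        "embedding_dim": 16,
        "num_classes": {"areas": 7, "ways": 10, "nodes": 33},
        "backbone": None,  # plain 1x1 convolution instead of a CNN backbone
        "output_dim": 8,
        "unary_prior": False,
    }
)
# data["map"] is a LongTensor of shape (B, 3, H, W) holding class indices for
# the areas/ways/nodes channels, with 0 meaning "no class"
pred = encoder({"map": torch.zeros(1, 3, 64, 64, dtype=torch.long)})
print(pred["map_features"][0].shape)  # torch.Size([1, 8, 64, 64])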