fffiloni committed
Commit c7de15e · verified · 1 Parent(s): 8c9dae5

Migrated from GitHub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. .gitattributes +9 -0
  2. LICENSE.txt +201 -0
  3. ORIGINAL_README.md +189 -0
  4. UserGuide.md +160 -0
  5. __init__.py +0 -0
  6. assets/images/girl.png +3 -0
  7. assets/images/snake.png +3 -0
  8. assets/images/test.jpg +3 -0
  9. assets/images/test2.jpg +0 -0
  10. assets/images/test3.jpg +3 -0
  11. assets/masks/test.png +0 -0
  12. assets/masks/test2.png +0 -0
  13. assets/materials/gr_infer_demo.jpg +3 -0
  14. assets/materials/gr_pre_demo.jpg +3 -0
  15. assets/materials/tasks.png +3 -0
  16. assets/materials/teaser.jpg +3 -0
  17. assets/videos/test.mp4 +3 -0
  18. assets/videos/test2.mp4 +0 -0
  19. pyproject.toml +75 -0
  20. requirements.txt +1 -0
  21. requirements/annotator.txt +6 -0
  22. requirements/framework.txt +26 -0
  23. run_vace_ltx.sh +48 -0
  24. run_vace_pipeline.sh +27 -0
  25. run_vace_preproccess.sh +58 -0
  26. run_vace_wan.sh +48 -0
  27. tests/test_annotators.py +568 -0
  28. vace/__init__.py +6 -0
  29. vace/annotators/__init__.py +24 -0
  30. vace/annotators/canvas.py +60 -0
  31. vace/annotators/common.py +62 -0
  32. vace/annotators/composition.py +155 -0
  33. vace/annotators/depth.py +88 -0
  34. vace/annotators/depth_anything_v2/__init__.py +0 -0
  35. vace/annotators/depth_anything_v2/dinov2.py +414 -0
  36. vace/annotators/depth_anything_v2/dpt.py +210 -0
  37. vace/annotators/depth_anything_v2/layers/__init__.py +11 -0
  38. vace/annotators/depth_anything_v2/layers/attention.py +79 -0
  39. vace/annotators/depth_anything_v2/layers/block.py +252 -0
  40. vace/annotators/depth_anything_v2/layers/drop_path.py +34 -0
  41. vace/annotators/depth_anything_v2/layers/layer_scale.py +28 -0
  42. vace/annotators/depth_anything_v2/layers/mlp.py +39 -0
  43. vace/annotators/depth_anything_v2/layers/patch_embed.py +90 -0
  44. vace/annotators/depth_anything_v2/layers/swiglu_ffn.py +64 -0
  45. vace/annotators/depth_anything_v2/util/__init__.py +0 -0
  46. vace/annotators/depth_anything_v2/util/blocks.py +151 -0
  47. vace/annotators/depth_anything_v2/util/transform.py +159 -0
  48. vace/annotators/dwpose/__init__.py +2 -0
  49. vace/annotators/dwpose/onnxdet.py +127 -0
  50. vace/annotators/dwpose/onnxpose.py +362 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/images/girl.png filter=lfs diff=lfs merge=lfs -text
+ assets/images/snake.png filter=lfs diff=lfs merge=lfs -text
+ assets/images/test.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/images/test3.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/materials/gr_infer_demo.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/materials/gr_pre_demo.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/materials/tasks.png filter=lfs diff=lfs merge=lfs -text
+ assets/materials/teaser.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/videos/test.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,189 @@
1
+ <p align="center">
2
+
3
+ <h1 align="center">VACE: All-in-One Video Creation and Editing</h1>
4
+ <p align="center">
5
+ <strong>Zeyinzi Jiang<sup>*</sup></strong>
6
+ ·
7
+ <strong>Zhen Han<sup>*</sup></strong>
8
+ ·
9
+ <strong>Chaojie Mao<sup>*&dagger;</sup></strong>
10
+ ·
11
+ <strong>Jingfeng Zhang</strong>
12
+ ·
13
+ <strong>Yulin Pan</strong>
14
+ ·
15
+ <strong>Yu Liu</strong>
16
+ <br>
17
+ <b>Tongyi Lab - <a href="https://github.com/Wan-Video/Wan2.1"><img src='https://ali-vilab.github.io/VACE-Page/assets/logos/wan_logo.png' alt='wan_logo' style='margin-bottom: -4px; height: 20px;'></a> </b>
18
+ <br>
19
+ <br>
20
+ <a href="https://arxiv.org/abs/2503.07598"><img src='https://img.shields.io/badge/VACE-arXiv-red' alt='Paper PDF'></a>
21
+ <a href="https://ali-vilab.github.io/VACE-Page/"><img src='https://img.shields.io/badge/VACE-Project_Page-green' alt='Project Page'></a>
22
+ <a href="https://huggingface.co/collections/ali-vilab/vace-67eca186ff3e3564726aff38"><img src='https://img.shields.io/badge/VACE-HuggingFace_Model-yellow'></a>
23
+ <a href="https://modelscope.cn/collections/VACE-8fa5fcfd386e43"><img src='https://img.shields.io/badge/VACE-ModelScope_Model-purple'></a>
24
+ <br>
25
+ </p>
26
+
27
+
28
+ ## Introduction
29
+ <strong>VACE</strong> is an all-in-one model designed for video creation and editing. It encompasses various tasks, including reference-to-video generation (<strong>R2V</strong>), video-to-video editing (<strong>V2V</strong>), and masked video-to-video editing (<strong>MV2V</strong>), allowing users to compose these tasks freely. This functionality enables users to explore diverse possibilities and streamlines their workflows effectively, offering a range of capabilities, such as Move-Anything, Swap-Anything, Reference-Anything, Expand-Anything, Animate-Anything, and more.
30
+
31
+ <img src='./assets/materials/teaser.jpg'>
32
+
33
+
34
+ ## 🎉 News
35
+ - [x] May 14, 2025: 🔥Wan2.1-VACE-1.3B and Wan2.1-VACE-14B models are now available at [HuggingFace](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) and [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)!
36
+ - [x] Mar 31, 2025: 🔥VACE-Wan2.1-1.3B-Preview and VACE-LTX-Video-0.9 models are now available at [HuggingFace](https://huggingface.co/collections/ali-vilab/vace-67eca186ff3e3564726aff38) and [ModelScope](https://modelscope.cn/collections/VACE-8fa5fcfd386e43)!
37
+ - [x] Mar 31, 2025: 🔥Release code of model inference, preprocessing, and gradio demos.
38
+ - [x] Mar 11, 2025: We propose [VACE](https://ali-vilab.github.io/VACE-Page/), an all-in-one model for video creation and editing.
39
+
40
+
41
+ ## 🪄 Models
42
+ | Models | Download Link | Video Size | License |
43
+ |--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|
44
+ | VACE-Wan2.1-1.3B-Preview | [Huggingface](https://huggingface.co/ali-vilab/VACE-Wan2.1-1.3B-Preview) 🤗 [ModelScope](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview) 🤖 | ~ 81 x 480 x 832 | [Apache-2.0](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B/blob/main/LICENSE.txt) |
45
+ | VACE-LTX-Video-0.9 | [Huggingface](https://huggingface.co/ali-vilab/VACE-LTX-Video-0.9) 🤗 [ModelScope](https://modelscope.cn/models/iic/VACE-LTX-Video-0.9) 🤖 | ~ 97 x 512 x 768 | [RAIL-M](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.license.txt) |
46
+ | Wan2.1-VACE-1.3B | [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B) 🤗 [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B) 🤖 | ~ 81 x 480 x 832 | [Apache-2.0](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B/blob/main/LICENSE.txt) |
47
+ | Wan2.1-VACE-14B | [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) 🤗 [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B) 🤖 | ~ 81 x 720 x 1280 | [Apache-2.0](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B/blob/main/LICENSE.txt) |
48
+
49
+ - The input supports any resolution, but optimal results are achieved when the video size falls within the ranges listed above.
50
+ - All models inherit the license of the original model.
51
+
52
+
53
+ ## ⚙️ Installation
54
+ The codebase was tested with Python 3.10.13, CUDA version 12.4, and PyTorch >= 2.5.1.
55
+
56
+ ### Setup for Model Inference
57
+ You can set up VACE model inference by running:
58
+ ```bash
59
+ git clone https://github.com/ali-vilab/VACE.git && cd VACE
60
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124 # If PyTorch is not installed.
61
+ pip install -r requirements.txt
62
+ pip install wan@git+https://github.com/Wan-Video/Wan2.1 # If you want to use Wan2.1-based VACE.
63
+ pip install ltx-video@git+https://github.com/Lightricks/LTX-Video@ltx-video-0.9.1 sentencepiece --no-deps # If you want to use LTX-Video-0.9-based VACE. It may conflict with Wan.
64
+ ```
65
+ Please download your preferred base model to `<repo-root>/models/`.
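+ For example, one way to fetch a model into that directory (a minimal sketch assuming the `huggingface-cli` tool from `huggingface_hub`; any download method works):
+ ```bash
+ pip install "huggingface_hub[cli]"
+ huggingface-cli download ali-vilab/VACE-Wan2.1-1.3B-Preview --local-dir models/VACE-Wan2.1-1.3B-Preview
+ ```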
66
+
67
+ ### Setup for Preprocess Tools
68
+ If you need preprocessing tools, please install:
69
+ ```bash
70
+ pip install -r requirements/annotator.txt
71
+ ```
72
+ Please download [VACE-Annotators](https://huggingface.co/ali-vilab/VACE-Annotators) to `<repo-root>/models/`.
73
+
74
+ ### Local Directories Setup
75
+ It is recommended to download [VACE-Benchmark](https://huggingface.co/datasets/ali-vilab/VACE-Benchmark) to `<repo-root>/benchmarks/`; it is used as example data in the `run_vace_xxx.sh` scripts.
76
+
77
+ We recommend organizing local directories as follows:
+ ```
79
+ VACE
80
+ ├── ...
81
+ ├── benchmarks
82
+ │ └── VACE-Benchmark
83
+ │ └── assets
84
+ │ └── examples
85
+ │ ├── animate_anything
86
+ │ │ └── ...
87
+ │ └── ...
88
+ ├── models
89
+ │ ├── VACE-Annotators
90
+ │ │ └── ...
91
+ │ ├── VACE-LTX-Video-0.9
92
+ │ │ └── ...
93
+ │ └── VACE-Wan2.1-1.3B-Preview
94
+ │ └── ...
95
+ └── ...
96
+ ```
97
+
98
+ ## 🚀 Usage
99
+ In VACE, users provide a **text prompt** and optional **video**, **mask**, and **image** inputs for video generation or editing.
100
+ Detailed instructions for using VACE can be found in the [User Guide](./UserGuide.md).
101
+
102
+ ### Inference CLI
103
+ #### 1) End-to-End Running
104
+ To run VACE without diving into implementation details, you can use the end-to-end pipeline. For example:
105
+ ```bash
106
+ # run V2V depth
107
+ python vace/vace_pipeline.py --base wan --task depth --video assets/videos/test.mp4 --prompt 'xxx'
108
+
109
+ # run MV2V inpainting by providing bbox
110
+ python vace/vace_pipeline.py --base wan --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4 --prompt 'xxx'
111
+ ```
112
+ This script runs video preprocessing and model inference sequentially;
+ you need to specify all the required args for preprocessing (`--task`, `--mode`, `--bbox`, `--video`, etc.) and inference (`--prompt`, etc.).
+ The output video, together with the intermediate video, mask, and images, will be saved to `./results/` by default.
115
+
116
+ > 💡**Note**:
117
+ > Please refer to [run_vace_pipeline.sh](./run_vace_pipeline.sh) for usage examples of different task pipelines.
118
+
119
+
120
+ #### 2) Preprocessing
121
+ For more flexible control over the input, user inputs need to be preprocessed into `src_video`, `src_mask`, and `src_ref_images` before VACE model inference.
+ We assign each [preprocessor](./vace/configs/__init__.py) a task name, so simply call [`vace_preproccess.py`](./vace/vace_preproccess.py) and specify the task name and its params. For example:
123
+ ```bash
124
+ # process video depth
125
+ python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
126
+
127
+ # process video inpainting by providing bbox
128
+ python vace/vace_preproccess.py --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4
129
+ ```
130
+ The outputs will be saved to `./processed/` by default.
131
+
132
+ > 💡**Note**:
133
+ > Please refer to [run_vace_pipeline.sh](./run_vace_pipeline.sh) for the preprocessing methods of different tasks.
+ > Moreover, refer to [vace/configs/](./vace/configs/) for all the pre-defined tasks and their required params.
+ > You can also customize preprocessors by implementing them in [`annotators`](./vace/annotators/__init__.py) and registering them in [`configs`](./vace/configs).
136
+
137
+
138
+ #### 3) Model Inference
+ Using the input data obtained from **Preprocessing**, model inference can be performed as follows:
140
+ ```bash
141
+ # For Wan2.1 single GPU inference (1.3B-480P)
142
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
143
+
144
+ # For Wan2.1 Multi GPU Acceleration inference (1.3B-480P)
145
+ pip install "xfuser>=0.4.1"
146
+ torchrun --nproc_per_node=8 vace/vace_wan_inference.py --dit_fsdp --t5_fsdp --ulysses_size 1 --ring_size 8 --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
147
+
148
+ # For Wan2.1 Multi GPU Acceleration inference (14B-720P)
149
+ torchrun --nproc_per_node=8 vace/vace_wan_inference.py --dit_fsdp --t5_fsdp --ulysses_size 8 --ring_size 1 --size 720p --model_name 'vace-14B' --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
150
+
151
+ # For LTX inference, run
152
+ python vace/vace_ltx_inference.py --ckpt_path <path-to-model> --text_encoder_path <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
153
+ ```
154
+ The output video, together with the intermediate video, mask, and images, will be saved to `./results/` by default.
155
+
156
+ > 💡**Note**:
157
+ > (1) Please refer to [vace/vace_wan_inference.py](./vace/vace_wan_inference.py) and [vace/vace_ltx_inference.py](./vace/vace_ltx_inference.py) for the inference args.
158
+ > (2) For LTX-Video, and for Wan2.1 with English prompts, prompt extension is needed to unlock the full model performance.
+ > Please follow the [instructions of Wan2.1](https://github.com/Wan-Video/Wan2.1?tab=readme-ov-file#2-using-prompt-extension) and set `--use_prompt_extend` when running inference (see the example below).
+ > (3) When using prompt extension for editing tasks, check the expanded prompt carefully: since the extension model cannot see the input video, the expanded text may not match the video being edited, which can affect the final result.
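+ A minimal example of the prompt-extension flag from note (2), reusing the single-GPU command above with the other args unchanged (depending on your setup, prompt extension may require additional configuration as described in the Wan2.1 instructions):
+ ```bash
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx" --use_prompt_extend
+ ```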
161
+
162
+ ### Inference Gradio
163
+ For preprocessors, run
164
+ ```bash
165
+ python vace/gradios/vace_preprocess_demo.py
166
+ ```
167
+ For model inference, run
168
+ ```bash
169
+ # For Wan2.1 gradio inference
170
+ python vace/gradios/vace_wan_demo.py
171
+
172
+ # For LTX gradio inference
173
+ python vace/gradios/vace_ltx_demo.py
174
+ ```
175
+
176
+ ## Acknowledgement
177
+
178
+ We are grateful for the following awesome projects, including [Scepter](https://github.com/modelscope/scepter), [Wan](https://github.com/Wan-Video/Wan2.1), and [LTX-Video](https://github.com/Lightricks/LTX-Video).
179
+
180
+
181
+ ## BibTeX
182
+
183
+ ```bibtex
184
+ @article{vace,
185
+ title = {VACE: All-in-One Video Creation and Editing},
186
+ author = {Jiang, Zeyinzi and Han, Zhen and Mao, Chaojie and Zhang, Jingfeng and Pan, Yulin and Liu, Yu},
187
+ journal = {arXiv preprint arXiv:2503.07598},
188
+ year = {2025}
189
+ }
UserGuide.md ADDED
@@ -0,0 +1,160 @@
1
+ # VACE User Guide
2
+
3
+ ## 1. Overall Steps
4
+
5
+ - Preparation: Determine the task type of your creative idea ([single task](#33-single-tasks) or [multi-task composition](#34-composition-task)), and prepare all the required materials (images, videos, prompts, etc.).
+ - Preprocessing: Select the appropriate preprocessing method based on the task name, then preprocess your materials to meet the model's input requirements.
+ - Inference: Perform VACE inference on the preprocessed materials to obtain the results.
8
+
9
+ ## 2. Preparations
10
+
11
+ ### 2.1 Task Definition
12
+
13
+ VACE, as a unified video generation solution, simultaneously supports Video Generation, Video Editing, and complex composition tasks. Specifically:
14
+
15
+ - Video Generation: No video input. Concepts are injected into the model through semantic understanding of text and reference materials, including **T2V** (Text-to-Video Generation) and **R2V** (Reference-to-Video Generation) tasks.
+ - Video Editing: With video input. The input video is modified at the pixel level, globally or locally, including **V2V** (Video-to-Video Editing) and **MV2V** (Masked Video-to-Video Editing).
+ - Composition Task: Compose two or more of the single tasks above into a complex composition task, such as **Reference Anything** (Face R2V + Object R2V), **Move Anything** (Frame R2V + Layout V2V), **Animate Anything** (R2V + Pose V2V), **Swap Anything** (R2V + Inpainting MV2V), and **Expand Anything** (Object R2V + Frame R2V + Outpainting MV2V), etc.
18
+
19
+ Single tasks and compositional tasks are illustrated in the diagram below:
20
+
21
+ ![vace_task](assets/materials/tasks.png)
22
+
23
+
24
+ ### 2.2 Limitations
25
+
26
+ - Very high-resolution videos will be resized to an appropriate spatial size.
+ - Very long videos will be trimmed or uniformly sampled down to around 5 seconds.
+ - For users who need long video generation, we recommend generating 5-second clips one by one, using the `firstclip` video extension task to keep temporal consistency (see the sketch below).
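+ A minimal sketch of this clip-by-clip workflow; `--task clipref --mode firstclip` is inferred from the single-task table below (by analogy with the `--task depth` example), and the file paths are placeholders:
+ ```bash
+ # 1) Generate the first 5s clip with any task, e.g. plain text-to-video.
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --prompt "xxx"
+
+ # 2) Treat the previous output as the first clip and extend it; the preprocessed
+ #    src_video/src_mask are written to ./processed/ by default.
+ python vace/vace_preproccess.py --task clipref --mode firstclip --video <path-to-previous-clip>
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --prompt "xxx"
+
+ # 3) Repeat step 2 with each newly generated clip to keep temporal consistency.
+ ```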
29
+
30
+ ## 3. Preprocessing
31
+ ### 3.1 VACE-Recognizable Inputs
32
+
33
+ User-collected materials need to be preprocessed into VACE-recognizable inputs, including **`src_video`**, **`src_mask`**, **`src_ref_images`**, and **`prompt`**.
+ Specific descriptions are as follows:
35
+
36
+ - `src_video`: The video fed into the model for editing, such as a condition video (Depth, Pose, etc.) or an in/outpainting input video. **Gray areas** (pixel value 127) represent the missing parts of the video. In the first-frame R2V task, the first frame is the reference frame while subsequent frames are left gray. The missing parts of an in/outpainting `src_video` are also set to gray.
+ - `src_mask`: A 3D mask with the same shape as `src_video`. **White areas** mark the parts to be generated, while **black areas** mark the parts to be retained.
+ - `src_ref_images`: Reference images for R2V. Salient object segmentation can be applied to keep the background white.
+ - `prompt`: A text description of the content of the output video. Prompt expansion can be used to achieve better generation results for LTX-Video and for English users of Wan2.1. Use descriptive prompts rather than instructions.
40
+
41
+ Among them, `prompt` is required while `src_video`, `src_mask`, and `src_ref_images` are optional. For instance, MV2V task requires `src_video`, `src_mask`, and `prompt`; R2V task only requires `src_ref_images` and `prompt`.
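+ For example, an MV2V (inpainting) workflow first produces `src_video` and `src_mask` via preprocessing and then passes them, together with the `prompt`, to inference (commands taken from the repository examples; preprocessed files are written to `./processed/` by default):
+ ```bash
+ # Produce src_video / src_mask for an inpainting region given by a bbox
+ python vace/vace_preproccess.py --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4
+
+ # Feed the preprocessed inputs to the model
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --prompt "xxx"
+ ```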
42
+
43
+ ### 3.2 Preprocessing Tools
44
+ Both command line and Gradio demo are supported.
45
+
46
+ 1) Command Line: You can refer to the `run_vace_preproccess.sh` script and invoke it based on the different task types. An example command is as follows:
47
+ ```bash
48
+ python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
49
+ ```
50
+
51
+ 2) Gradio Interactive: Launch the graphical interface for data preprocessing and perform preprocessing on the interface. The specific command is as follows:
52
+ ```bash
53
+ python vace/gradios/preprocess_demo.py
54
+ ```
55
+
56
+ ![gr_pre_demo](assets/materials/gr_pre_demo.jpg)
57
+
58
+
59
+ ### 3.3 Single Tasks
60
+
61
+ VACE is an all-in-one model supporting various task types, but each task type requires different preprocessing. The specific task types and descriptions are as follows:
62
+
63
+ | Task | Subtask | Annotator | Input modal | Params | Note |
64
+ |------------|----------------------|----------------------------|------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------|
65
+ | txt2vid | txt2vid | / | / | / | |
66
+ | control | depth | DepthVideoAnnotator | video | / | |
67
+ | control | flow | FlowVisAnnotator | video | / | |
68
+ | control | gray | GrayVideoAnnotator | video | / | |
69
+ | control | pose | PoseBodyFaceVideoAnnotator | video | / | |
70
+ | control | scribble | ScribbleVideoAnnotator | video | / | |
71
+ | control | layout_bbox | LayoutBboxAnnotator | two bboxes <br>'x1,y1,x2,y2 x1,y1,x2,y2' | / | Move linearly from the first box to the second box |
72
+ | control | layout_track | LayoutTrackAnnotator | video | mode='masktrack/bboxtrack/label/caption'<br>maskaug_mode(optional)='original/original_expand/hull/hull_expand/bbox/bbox_expand'<br>maskaug_ratio(optional)=0~1.0 | Mode represents different methods of subject tracking. |
73
+ | extension | frameref | FrameRefExpandAnnotator | image | mode='firstframe'<br>expand_num=80 (default) | |
74
+ | extension | frameref | FrameRefExpandAnnotator | image | mode='lastframe'<br>expand_num=80 (default) | |
75
+ | extension | frameref | FrameRefExpandAnnotator | two images<br>a.jpg,b.jpg | mode='firstlastframe'<br>expand_num=80 (default) | Images are separated by commas. |
76
+ | extension | clipref | FrameRefExpandAnnotator | video | mode='firstclip'<br>expand_num=80 (default) | |
77
+ | extension | clipref | FrameRefExpandAnnotator | video | mode='lastclip'<br>expand_num=80 (default) | |
78
+ | extension | clipref | FrameRefExpandAnnotator | two videos<br>a.mp4,b.mp4 | mode='firstlastclip'<br>expand_num=80 (default) | Videos are separated by commas. |
79
+ | repainting | inpainting_mask | InpaintingAnnotator | video | mode='salient' | Use salient as a fixed mask. |
80
+ | repainting | inpainting_mask | InpaintingAnnotator | video + mask | mode='mask' | Use mask as a fixed mask. |
81
+ | repainting | inpainting_bbox | InpaintingAnnotator | video + bbox<br>'x1, y1, x2, y2' | mode='bbox' | Use bbox as a fixed mask. |
82
+ | repainting | inpainting_masktrack | InpaintingAnnotator | video | mode='salientmasktrack' | Use salient mask for dynamic tracking. |
83
+ | repainting | inpainting_masktrack | InpaintingAnnotator | video | mode='salientbboxtrack' | Use salient bbox for dynamic tracking. |
84
+ | repainting | inpainting_masktrack | InpaintingAnnotator | video + mask | mode='masktrack' | Use mask for dynamic tracking. |
85
+ | repainting | inpainting_bboxtrack | InpaintingAnnotator | video + bbox<br>'x1, y1, x2, y2' | mode='bboxtrack' | Use bbox for dynamic tracking. |
86
+ | repainting | inpainting_label | InpaintingAnnotator | video + label | mode='label' | Use label for dynamic tracking. |
87
+ | repainting | inpainting_caption | InpaintingAnnotator | video + caption | mode='caption' | Use caption for dynamic tracking. |
88
+ | repainting | outpainting | OutpaintingVideoAnnotator | video | direction=left/right/up/down<br>expand_ratio=0~1.0 | Combine outpainting directions arbitrarily. |
89
+ | reference | image_reference | SubjectAnnotator | image | mode='salient/mask/bbox/salientmasktrack/salientbboxtrack/masktrack/bboxtrack/label/caption'<br>maskaug_mode(optional)='original/original_expand/hull/hull_expand/bbox/bbox_expand'<br>maskaug_ratio(optional)=0~1.0 | Use different methods to obtain the subject region. |
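+ For example, based on the table above, a subtask is selected via `--task` plus the listed params. The `--task`, `--mode`, and `--video` flags appear in the examples above; the image and outpainting flags below are assumptions that mirror the "Input modal" and "Params" columns:
+ ```bash
+ # Control: extract a pose video
+ python vace/vace_preproccess.py --task pose --video assets/videos/test.mp4
+
+ # Extension: expand a single image into a first-frame reference clip (image flag assumed)
+ python vace/vace_preproccess.py --task frameref --mode firstframe --image assets/images/test.jpg
+
+ # Repainting: outpaint a video (direction/expand_ratio flags assumed)
+ python vace/vace_preproccess.py --task outpainting --direction up --expand_ratio 0.5 --video assets/videos/test.mp4
+ ```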
90
+
91
+ ### 3.4 Composition Task
92
+
93
+ Moreover, VACE supports combining tasks to accomplish more complex objectives. The following examples illustrate how tasks can be combined, but these combinations are not limited to the examples provided:
94
+
95
+ | Task | Subtask | Annotator | Input modal | Params | Note |
96
+ |-------------|--------------------|----------------------------|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
97
+ | composition | reference_anything | ReferenceAnythingAnnotator | image_list | mode='salientmasktrack/salientbboxtrack/masktrack/bboxtrack/label/caption' | Input no more than three images. |
98
+ | composition | animate_anything | AnimateAnythingAnnotator | image + video | mode='salientmasktrack/salientbboxtrack/masktrack/bboxtrack/label/caption' | Video for conditional redrawing; images for reference generation. |
99
+ | composition | swap_anything | SwapAnythingAnnotator | image + video | mode='masktrack/bboxtrack/label/caption'<br>maskaug_mode(optional)='original/original_expand/hull/hull_expand/bbox/bbox_expand'<br>maskaug_ratio(optional)=0~1.0 | Video for conditional redrawing; images for reference generation.<br>Comma-separated mode: first for video, second for images. |
100
+ | composition | expand_anything | ExpandAnythingAnnotator | image + image_list | mode='masktrack/bboxtrack/label/caption'<br>direction=left/right/up/down<br>expand_ratio=0~1.0<br>expand_num=80 (default) | First image for extension edit; others for reference.<br>Comma-separated mode: first for video, second for images. |
101
+ | composition | move_anything | MoveAnythingAnnotator | image + two bboxes | expand_num=80 (default) | First image for initial frame reference; others represented by linear bbox changes. |
102
+ | composition | more_anything | ... | ... | ... | ... |
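+ For example, a Swap-Anything preprocessing call combines a video with a reference image, using a comma-separated mode whose first part applies to the video and second to the image; the image flag and the exact mode syntax are assumptions based on the table:
+ ```bash
+ python vace/vace_preproccess.py --task swap_anything --mode 'masktrack,salientmasktrack' --video assets/videos/test.mp4 --image assets/images/girl.png
+ ```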
103
+
104
+
105
+ ## 4. Model Inference
106
+
107
+ ### 4.1 Execution Methods
108
+
109
+ Both command line and Gradio demo are supported.
110
+
111
+ 1) Command Line: Refer to the `run_vace_ltx.sh` and `run_vace_wan.sh` scripts and invoke them based on the different task types. The input data needs to be preprocessed to obtain parameters such as `src_video`, `src_mask`, `src_ref_images` and `prompt`. An example command is as follows:
112
+ ```bash
113
+ python vace/vace_wan_inference.py --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt <prompt> # wan
114
+ python vace/vace_ltx_inference.py --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt <prompt> # ltx
115
+ ```
116
+
117
+ 2) Gradio Interactive: Launch the graphical interface for model inference and perform inference through interactions on the interface. The specific command is as follows:
118
+ ```bash
119
+ python vace/gradios/vace_wan_demo.py # wan
120
+ python vace/gradios/vace_ltx_demo.py # ltx
121
+ ```
122
+
123
+ ![gr_infer_demo](assets/materials/gr_infer_demo.jpg)
124
+
125
+ 3) End-to-End Inference: Refer to the `run_vace_pipeline.sh` script and invoke it based on different task types and input data. This pipeline includes both preprocessing and model inference, thereby requiring only user-provided materials. However, it offers relatively less flexibility. An example command is as follows:
126
+ ```bash
127
+ python vace/vace_pipeline.py --base wan --task depth --video <path-to-video> --prompt <prompt> # wan
128
+ python vace/vace_pipeline.py --base ltx --task depth --video <path-to-video> --prompt <prompt> # ltx
129
+ ```
130
+
131
+ ### 4.2 Inference Examples
132
+
133
+ We provide test examples under different tasks, enabling users to validate according to their needs. These include **task**, **sub-tasks**, **original inputs** (ori_videos and ori_images), **model inputs** (src_video, src_mask, src_ref_images, prompt), and **model outputs**.
134
+
135
+ | task | subtask | src_video | src_mask | src_ref_images | out_video | prompt | ori_video | ori_images |
136
+ |-------------|--------------------|----------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
137
+ | txt2vid | txt2vid | | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/txt2vid/out_video.mp4"></video> | 狂风巨浪的大海,镜头缓缓推进,一艘渺小的帆船在汹涌的波涛中挣扎漂荡。海面上白沫翻滚,帆船时隐时现,仿佛随时可能被巨浪吞噬。天空乌云密布,雷声轰鸣,海鸥在空中盘旋尖叫。帆船上的人们紧紧抓住缆绳,努力保持平衡。画面风格写实,充满紧张和动感。近景特写,强调风浪的冲击力和帆船的摇晃 | | |
138
+ | extension | firstframe | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/src_mask.mp4"></video> | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/out_video.mp4"></video> | 纪实摄影风格,前景是一位中国越野爱好者坐在越野车上,手持车载电台正在进行通联。他五官清晰,表情专注,眼神坚定地望向前方。越野车停在户外,车身略显脏污,显示出经历过的艰难路况。镜头从车外缓缓拉近,最后定格在人物的面部特写上,展现出他的坚定与热情。中景到近景,动态镜头运镜。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/ori_image_1.png"> |
139
+ | repainting | inpainting | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/src_mask.mp4"></video> | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/out_video.mp4"></video> | 一只巨大的金色凤凰从繁华的城市上空展翅飞过,羽毛如火焰般璀璨,闪烁着温暖的光辉,翅膀雄伟地展开。凤凰高昂着头,目光炯炯,轻轻扇动翅膀,散发出淡淡的光芒。下方是熙熙攘攘的市中心,人群惊叹,车水马龙,红蓝两色的霓虹灯在夜空下闪烁。镜头俯视城市街道,捕捉这一壮丽的景象,营造出既神秘又辉煌的氛围。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/ori_video.mp4"></video> | |
140
+ | repainting | outpainting | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/src_mask.mp4"></video> | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/out_video.mp4"></video> | 赛博朋克风格,无人机俯瞰视角下的现代西安城墙,镜头穿过永宁门时泛起金色涟漪,城墙砖块化作数据流重组为唐代长安城。周围的街道上流动的人群和飞驰的机械交通工具交织在一起,现代与古代的交融,城墙上的灯光闪烁,形成时空隧道的效果。全息投影技术展现历史变迁,粒子重组特效细腻逼真。大远景逐渐过渡到特写,聚焦于城门特效。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/ori_video.mp4"></video> | |
141
+ | control | depth | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/depth/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/depth/out_video.mp4"></video> | 一群年轻人在天空之城拍摄集体照。画面中,一对年轻情侣手牵手,轻声细语,相视而笑,周围是飞翔的彩色热气球和闪烁的星星,营造出浪漫的氛围。天空中,暖阳透过飘浮的云朵,洒下斑驳的光影。镜头以近景特写开始,随着情侣间的亲密互动,缓缓拉远。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/depth/ori_video.mp4"></video> | |
142
+ | control | flow | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/flow/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/flow/out_video.mp4"></video> | 纪实摄影风格,一颗鲜红的小番茄缓缓落入盛着牛奶的玻璃杯中,溅起晶莹的水花。画面以慢镜头捕捉这一瞬间,水花在空中绽放,形成美丽的弧线。玻璃杯中的牛奶纯白,番茄的鲜红与之形成鲜明对比。背景简洁,突出主体。近景特写,垂直俯视视角,展现细节之美。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/flow/ori_video.mp4"></video> | |
143
+ | control | gray | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/gray/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/gray/out_video.mp4"></video> | 镜头缓缓向右平移,身穿淡黄色坎肩长裙的长发女孩面对镜头露出灿烂的漏齿微笑。她的长发随风轻扬,眼神明亮而充满活力。背景是秋天红色和黄色的树叶,阳光透过树叶的缝隙洒下斑驳光影,营造出温馨自然的氛围。画面风格清新自然,仿佛夏日午后的一抹清凉。中景人像,强调自然光效和细腻的皮肤质感。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/gray/ori_video.mp4"></video> | |
144
+ | control | pose | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/pose/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/pose/out_video.mp4"></video> | 在一个热带的庆祝派对上,一家人围坐在椰子树下的长桌旁。桌上摆满了异国风味的美食。长辈们愉悦地交谈,年轻人兴奋地举杯碰撞,孩子们在沙滩上欢乐奔跑。背景中是湛蓝的海洋和明亮的阳光,营造出轻松的气氛。镜头以动态中景捕捉每个开心的瞬间,温暖的阳光映照着他们幸福的面庞。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/pose/ori_video.mp4"></video> | |
145
+ | control | scribble | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/scribble/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/scribble/out_video.mp4"></video> | 画面中荧光色彩的无人机从极低空高速掠过超现实主义风格的西安古城墙,尘埃反射着阳光。镜头快速切换至城墙上的砖石特写,阳光温暖地洒落,勾勒出每一块砖块的细腻纹理。整体画质清晰华丽,运镜流畅如水。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/scribble/ori_video.mp4"></video> | |
146
+ | control | layout | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/layout/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/layout/out_video.mp4"></video> | 视频展示了一只成鸟在树枝上的巢中喂养它的幼鸟。成鸟在喂食的过程中,幼鸟张开嘴巴等待食物。随后,成鸟飞走,幼鸟继续等待。成鸟再次飞回,带回食物喂养幼鸟。整个视频的拍摄角度固定,聚焦于巢穴和鸟类的互动,背景是模糊的绿色植被,强调了鸟类的自然行为和生态环境。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/layout/ori_video.mp4"></video> | |
147
+ | reference | face | | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/face/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/face/out_video.mp4"></video> | 视频展示了一位长着尖耳朵的老人,他有一头银白色的长发和小胡子,穿着一件色彩斑斓的长袍,内搭金色衬衫,散发出神秘与智慧的气息。背景为一个华丽宫殿的内部,金碧辉煌。灯光明亮,照亮他脸上的神采奕奕。摄像机旋转动态拍摄,捕捉老人轻松挥手的动作。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/face/ori_image_1.png"> |
148
+ | reference | object | | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/object/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/object/out_video.mp4"></video> | 经典游戏角色马里奥在绿松石色水下世界中,四周环绕着珊瑚和各种各样的热带鱼。马里奥兴奋地向上跳起,摆出经典的欢快姿势,身穿鲜明的蓝色潜水服,红色的潜水面���上印有“M”标志,脚上是一双潜水靴。背景中,水泡随波逐流,浮现出一个巨大而友好的海星。摄像机从水底向上快速移动,捕捉他跃出水面的瞬间,灯光明亮而流动。该场景融合了动画与幻想元素,令人惊叹。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/object/ori_image_1.png"> |
149
+ | composition | reference_anything | | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_1.png">,<img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_2.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/out_video.mp4"></video> | 一名打扮成超人的男子自信地站着,面对镜头,肩头有一只充满活力的毛绒黄色鸭子。他留着整齐的短发和浅色胡须,鸭子有橙色的喙和脚,它的翅膀稍微展开,脚分开以保持稳定。他的表情严肃而坚定。他穿着标志性的蓝红超人服装,胸前有黄色“S”标志。斗篷在他身后飘逸。背景有行人。相机位于视线水平,捕捉角色的整个上半身。灯光均匀明亮。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/ori_image_1.png">,<img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/ori_image_2.png"> |
150
+ | composition | swap_anything | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_mask.mp4"></video> | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/out_video.mp4"></video> | 视频展示了一个人在宽阔的草原上骑马。他有淡紫色长发,穿着传统服饰白上衣黑裤子,动画建模画风,看起来像是在进行某种户外活动或者是在进行某种表演。背景是壮观的山脉和多云的天空,给人一种宁静而广阔的感觉。整个视频的拍摄角度是固定的,重点展示了骑手和他的马。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/ori_video.mp4"></video> | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/ori_image_1.jpg"> |
151
+ | composition | expand_anything | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_mask.mp4"></video> | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/out_video.mp4"></video> | 古典油画风格,背景是一条河边,画面中央一位成熟优雅的女人,穿着长裙坐在椅子上。她双手从怀里取出打开的红色心形墨镜戴上。固定机位。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/ori_image_1.jpeg">,<img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/ori_image_2.png"> |
152
+
153
+ ## 5. Limitations
154
+
155
+ - VACE-LTX-Video-0.9
156
+ - The prompt significantly impacts video generation quality on LTX-Video. It must be extended following the method described in this [system prompt](https://huggingface.co/spaces/Lightricks/LTX-Video-Playground/blob/main/assets/system_prompt_i2v.txt). We also provide an input parameter for prompt extension (`--use_prompt_extend`).
+ - This model is intended for experimental research validation within the VACE paper and does not guarantee performance in real-world scenarios. However, its inference speed is very fast (a video in 25 seconds with 40 steps on an A100 GPU), making it suitable for preliminary data and creative validation.
158
+ - VACE-Wan2.1-1.3B-Preview
159
+ - This model mainly keeps the original Wan2.1-T2V-1.3B's video quality while supporting various tasks.
160
+ - When you encounter failure cases with specific tasks, we recommend trying again with a different seed and adjusting the prompt.
__init__.py ADDED
File without changes
assets/images/girl.png ADDED

Git LFS Details

  • SHA256: f461a83c0772dbe93a05ae6b8ce9fa77f0e7f5facb4402685b5410c0dc18397f
  • Pointer size: 131 Bytes
  • Size of remote file: 836 kB
assets/images/snake.png ADDED

Git LFS Details

  • SHA256: 60ae5e275f64de6ca99c5e63eaea6812fe09a6d7e7a233e483e700122ad08124
  • Pointer size: 131 Bytes
  • Size of remote file: 446 kB
assets/images/test.jpg ADDED

Git LFS Details

  • SHA256: 71549d76843c4ee220f37f45e87f0dfc22079d1bc5fbe3f52fe2ded2b9454a3b
  • Pointer size: 131 Bytes
  • Size of remote file: 143 kB
assets/images/test2.jpg ADDED
assets/images/test3.jpg ADDED

Git LFS Details

  • SHA256: bee71955dac07594b21937c2354ab5b7bd3f3321447202476178dab5ceead497
  • Pointer size: 131 Bytes
  • Size of remote file: 214 kB
assets/masks/test.png ADDED
assets/masks/test2.png ADDED
assets/materials/gr_infer_demo.jpg ADDED

Git LFS Details

  • SHA256: 9b4f0df3c602da88e707262029d78284b3b5857e2bac413edef6f117e3ddb8be
  • Pointer size: 131 Bytes
  • Size of remote file: 320 kB
assets/materials/gr_pre_demo.jpg ADDED

Git LFS Details

  • SHA256: 6939180a97bd5abfc8d90bef6b31e949c591e2d75f5719e0eac150871d4aaae2
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
assets/materials/tasks.png ADDED

Git LFS Details

  • SHA256: 1f1c4b3f3e6ae927880fbe2f9a46939cc98824bb56c2753c975a2e3c4820830b
  • Pointer size: 131 Bytes
  • Size of remote file: 709 kB
assets/materials/teaser.jpg ADDED

Git LFS Details

  • SHA256: 87ce75e8dcbf1536674d3a951326727e0aff80192f52cf7388b34c03f13f711f
  • Pointer size: 131 Bytes
  • Size of remote file: 892 kB
assets/videos/test.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2195efbd92773f1ee262154577c700e9c3b7a4d7d04b1a2ac421db0879c696b0
+ size 737090
assets/videos/test2.mp4 ADDED
Binary file (79.6 kB).
 
pyproject.toml ADDED
@@ -0,0 +1,75 @@
+ [build-system]
+ requires = ["setuptools>=42", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "vace"
+ version = "1.1.0"
+ description = "VACE: All-in-One Video Creation and Editing"
+ authors = [
+     { name = "VACE Team", email = "wan.ai@alibabacloud.com" }
+ ]
+ requires-python = ">=3.10,<4.0"
+ readme = "README.md"
+ dependencies = [
+     "torch>=2.5.1",
+     "torchvision>=0.20.1",
+     "opencv-python>=4.9.0.80",
+     "diffusers>=0.31.0",
+     "transformers>=4.49.0",
+     "tokenizers>=0.20.3",
+     "accelerate>=1.1.1",
+     "gradio>=5.0.0",
+     "numpy>=1.23.5,<2",
+     "tqdm",
+     "imageio",
+     "easydict",
+     "ftfy",
+     "dashscope",
+     "imageio-ffmpeg",
+     "flash_attn",
+     "decord",
+     "einops",
+     "scikit-image",
+     "scikit-learn",
+     "pycocotools",
+     "timm",
+     "onnxruntime-gpu",
+     "BeautifulSoup4"
+ ]
+
+ [project.optional-dependencies]
+ ltx = [
+     "ltx-video@git+https://github.com/Lightricks/LTX-Video@ltx-video-0.9.1"
+ ]
+ wan = [
+     "wan@git+https://github.com/Wan-Video/Wan2.1"
+ ]
+ annotator = [
+     "insightface",
+     "sam-2@git+https://github.com/facebookresearch/sam2.git",
+     "segment-anything@git+https://github.com/facebookresearch/segment-anything.git",
+     "groundingdino@git+https://github.com/IDEA-Research/GroundingDINO.git",
+     "ram@git+https://github.com/xinyu1205/recognize-anything.git",
+     "raft@git+https://github.com/martin-chobanyan-sdc/RAFT.git"
+ ]
+
+ [project.urls]
+ homepage = "https://ali-vilab.github.io/VACE-Page/"
+ documentation = "https://ali-vilab.github.io/VACE-Page/"
+ repository = "https://github.com/ali-vilab/VACE"
+ hfmodel = "https://huggingface.co/collections/ali-vilab/vace-67eca186ff3e3564726aff38"
+ msmodel = "https://modelscope.cn/collections/VACE-8fa5fcfd386e43"
+ paper = "https://arxiv.org/abs/2503.07598"
+
+ [tool.setuptools]
+ packages = { find = {} }
+
+ [tool.black]
+ line-length = 88
+
+ [tool.isort]
+ profile = "black"
+
+ [tool.mypy]
+ strict = true
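+
+ # The optional dependency groups above can be installed as pip extras, e.g. with an
+ # editable install from a local clone (a minimal sketch; combine extras as needed):
+ #   pip install -e ".[wan]"        # Wan2.1-based VACE
+ #   pip install -e ".[ltx]"        # LTX-Video-based VACE
+ #   pip install -e ".[annotator]"  # preprocessing / annotator tools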
requirements.txt ADDED
@@ -0,0 +1 @@
+ -r requirements/framework.txt
requirements/annotator.txt ADDED
@@ -0,0 +1,6 @@
+ insightface
+ git+https://github.com/facebookresearch/sam2.git
+ git+https://github.com/facebookresearch/segment-anything.git
+ git+https://github.com/IDEA-Research/GroundingDINO.git
+ git+https://github.com/xinyu1205/recognize-anything.git
+ git+https://github.com/martin-chobanyan-sdc/RAFT.git
requirements/framework.txt ADDED
@@ -0,0 +1,26 @@
+ torch>=2.5.1
+ torchvision>=0.20.1
+ opencv-python>=4.9.0.80
+ diffusers>=0.31.0
+ transformers>=4.49.0
+ tokenizers>=0.20.3
+ accelerate>=1.1.1
+ gradio>=5.0.0
+ numpy>=1.23.5,<2
+ tqdm
+ imageio
+ easydict
+ ftfy
+ dashscope
+ imageio-ffmpeg
+ flash_attn
+ decord
+ einops
+ scikit-image
+ scikit-learn
+ pycocotools
+ timm
+ onnxruntime-gpu
+ BeautifulSoup4
+ #ltx-video@git+https://github.com/Lightricks/LTX-Video@ltx-video-0.9.1
+ #wan@git+https://github.com/Wan-Video/Wan2.1
run_vace_ltx.sh ADDED
@@ -0,0 +1,48 @@
1
+ #------------------------ Gradio ------------------------#
2
+ python vace/gradios/vace_ltx_demo.py
3
+
4
+ #------------------------ CLI ------------------------#
5
+ # txt2vid txt2vid
6
+ python vace/vace_ltx_inference.py --prompt "A sailboat with a white sail is navigating through rough, dark blue ocean waters under a stormy sky filled with thick, gray clouds. The boat tilts significantly as it rides the waves, and several seagulls fly around it. The scene is captured in real-life footage, with the camera angle shifting to follow the movement of the boat, emphasizing its struggle against the turbulent sea. The lighting is dim, reflecting the overcast conditions, and the overall tone is dramatic and intense."
7
+
8
+ # extension firstframe
9
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_mask.mp4" --prompt "A man in a black long-sleeve shirt is sitting inside a white vehicle, holding a walkie-talkie. He looks out the window with a serious expression. The camera gradually zooms in on his face, emphasizing his focused gaze. The background is blurred, but it appears to be an outdoor setting with some structures visible. The lighting is natural and bright, suggesting daytime. The scene is captured in real-life footage."
10
+
11
+ # repainting inpainting
12
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_mask.mp4" --prompt "A huge golden phoenix spread its wings and flew over the bustling city, its feathers shining brightly like flames, shimmering with warm radiance, and its wings spreading out majestically. The city below is filled with tall buildings adorned with colorful lights and billboards, creating a vibrant urban landscape. The camera follows the phoenix's flight from a high angle, capturing the grandeur of both the creature and the cityscape. The lighting is predominantly artificial, casting a warm glow on the buildings and streets, contrasting with the dark sky. The scene is a blend of animation and real-life footage, seamlessly integrating the fantastical element of the phoenix into a realistic city environment."
13
+
14
+ # repainting outpainting
15
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_mask.mp4" --prompt "The video begins with an aerial view of a grand, ancient gate illuminated by warm lights against the evening sky. The gate is surrounded by lush greenery and traditional Chinese architecture, including a prominent red-roofed building in the background. As the scene progresses, the gate's lighting intensifies, and a dynamic light show starts, featuring bright yellow and blue streaks emanating from the gate's archway, creating a visually striking effect. The light show continues to build in intensity, with more vibrant colors and patterns emerging. The camera angle remains static, capturing the entire spectacle from above. The lighting transitions from the natural dusk hues to the vivid, artificial lights of the display, enhancing the dramatic atmosphere. The scene is captured in real-life footage, showcasing the blend of historical architecture and modern light technology."
16
+
17
+ # control depth
18
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/depth/src_video.mp4" --prompt "In this enchanting animated scene, a group of young people gathers in a whimsical sky city to take a group photo, yet the photographer consistently captures the tender moments shared between couples. In the foreground, a young couple holds hands, while gazing into each other's eyes, smiles lighting up their faces. Surrounding them, vibrant hot air balloons float gracefully, and twinkling stars add a touch of magic to the atmosphere. The background features a dreamy sky, where warm sunlight filters through fluffy clouds, creating dappled shadows on the scene. The camera begins with a close-up, focusing on the couple's affectionate gestures, then slowly zooms out to reveal the warmth and vibrancy of the entire setting. The lighting is soft and romantic, casting a golden hue. The scene is captured in real-life footage"
19
+
20
+ # control flow
21
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/flow/src_video.mp4" --prompt "A bright red tomato was placed in a glass of milk, splashing water and creating ripples. The tomato sinks to the bottom of the glass, and the milk keeps shaking. The camera angle is a close-up shot, focusing on glass and milk. The bright and natural lighting highlights the pure white of the milk and the bright red of the tomatoes. This scene seems to be a real shot."
22
+
23
+ # control gray
24
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/gray/src_video.mp4" --prompt "A young woman with long, straight purple hair is standing in front of a lush autumn background. She is wearing an off-shoulder light yellow dress and smiling at the camera. The wind gently blows her hair to one side. The lighting is natural and bright, highlighting her features and the vibrant red and yellow leaves behind her. The scene is captured in real-life footage with a steady camera angle focusing on the woman's upper body."
25
+
26
+ # control pose
27
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/pose/src_video.mp4" --prompt "In a tropical celebration, a family gathers around a long table nestled under swaying palm trees, basking in the warmth of the sun. The table is laden with an array of exotic dishes, each colorful plate invitingly displayed. Elders engage in joyful conversations, their faces animated, while young adults raise their glasses in enthusiastic toasts. Children dash across the sandy beach. The background features a stunning azure ocean under a bright sun. The camera angle is in a dynamic mid-shot, fluidly capturing the moments of laughter and connection, while the lighting is bright and golden. The scene is presented in a realistic style."
28
+
29
+ # control scribble
30
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/scribble/src_video.mp4" --prompt "In this visually stunning scene, a vivid, neon-colored drone zips past the surreal West Xi'an ancient city wall at a low altitude, kicking up a cloud of glittering dust that catches the sunlight in a spectrum of colors. The camera swiftly shifts to a close-up of the bricks on the wall, where warm sunlight illuminates each stone, revealing intricate textures that tell tales of history. The background is rich with the majestic, timeworn wall, blending seamlessly into a dreamy atmosphere. The camera angle is at a dynamic angle, following the drone's swift movements with smooth transitions. The lighting is bright and vibrant, casting a magical glow. This scene is realized in striking animation."
31
+
32
+ # control layout
33
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/layout/src_video.mp4" --prompt "A small bird with a grey head, white chest, and orange tail feathers lands on a nest in a tree. The nest is made of twigs and leaves and contains three baby birds with their mouths open, waiting to be fed. The adult bird feeds the baby birds one by one, then takes off from the nest. The background is a blurred green forest, providing a natural setting for the scene. The camera angle is steady, focusing on the nest and the birds, capturing the intimate moment of feeding. The lighting is bright and natural, highlighting the colors of the birds and the nest. The scene appears to be real-life footage."
34
+
35
+ # reference face
36
+ python vace/vace_ltx_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/face/src_ref_image_1.png" --prompt "The video unfolds with an elderly man sporting pointy ears, his long silver hair cascading down, and a neatly trimmed goatee, wearing a vibrant, colorful robe over a golden shirt that radiates an aura of mystery and wisdom. The background is the interior of a magnificent palace, shining brilliantly. The camera dynamically rotates to capture this enchanting moment from various angles. The lighting is bright casting a warm glow. This scene seems to be a real shot."
37
+
38
+ # reference object
39
+ python vace/vace_ltx_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/object/src_ref_image_1.png" --prompt "Classic game character Mario is submerged in a turquoise underwater world, surrounded by vibrant corals and various tropical fish. He jumps excitedly upwards, striking his iconic cheerful pose while wearing a bright blue wetsuit and a red diving mask adorned with an “M” logo. His feet are equipped with sturdy diving boots. In the background, bubbles drift with the currents, revealing a large and friendly starfish nearby. The camera moves swiftly from the seabed upwards, capturing the moment he breaks the surface of the water. The lighting is bright and flowing. The scene combines animated and fantastical elements, creating a visually stunning experience."
40
+
41
+ # composition reference_anything
42
+ python vace/vace_ltx_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_1.png,benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_2.png" --prompt "A man dressed as Superman stands confidently facing the camera, with a lively plush yellow duck perched on his shoulder. The man has neatly trimmed short hair and light stubble, while the duck features an orange beak and feet with slightly spread wings and legs positioned to maintain balance. The man's expression is serious and determined. He wears the iconic blue and red Superman costume, complete with a yellow \"S\" emblem on his chest and a cape flowing behind him. The background includes pedestrians walking by, adding to the scene's atmosphere. The camera is positioned at eye level, capturing the man's entire upper body. The lighting is bright and even, illuminating both the man and the duck. The scene appears to be real-life footage."
43
+
44
+ # composition swap_anything
45
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_ref_image_1.png" --prompt "The video depicts a person with long, pale purple hair riding a horse across a vast grassland. The individual wears traditional attire featuring a white top and black pants, styled in an animation modeling approach, suggesting engagement in some outdoor activity or performance. The backdrop showcases magnificent mountains under a sky dotted with clouds, imparting a serene and expansive atmosphere. The camera angle is fixed throughout the video, focusing on the rider and his horse as they move through the landscape. The lighting is natural, highlighting the serene majesty of the scene. The scene is animated, capturing the tranquil beauty of the vast plains and towering mountains."
46
+
47
+ # composition expand_anything
48
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_ref_image_1.png" --prompt "Set in the style of a classical oil painting, the scene unfolds along the bank of a river. At the center of the frame is a mature and elegant woman seated on a chair, wearing a flowing long dress. She gracefully lifts her hands from her lap to don a pair of red, heart-shaped sunglasses. The background features the tranquil river and lush surroundings, enhancing the serene atmosphere. The camera remains fixed, emphasizing the woman and her refined actions. The lighting is soft and warm, imitating the natural illumination typical of oil paintings. The scene is animated to replicate the timeless beauty and charm of classical art."
run_vace_pipeline.sh ADDED
@@ -0,0 +1,27 @@
1
+ #------------------------ Pipeline ------------------------#
2
+ # extension firstframe
3
+ python vace/vace_pipeline.py --base wan --task frameref --mode firstframe --image "benchmarks/VACE-Benchmark/assets/examples/firstframe/ori_image_1.png" --prompt "纪实摄影风格,前景是一位中国越野爱好者坐在越野车上,手持车载电台正在进行通联。他五官清晰,表情专注,眼神坚定地望向前方。越野车停在户外,车身略显脏污,显示出经历过的艰难路况。镜头从车外缓缓拉近,最后定格在人物的面部特写上,展现出他的坚定与热情。中景到近景,动态镜头运镜。"
4
+
5
+ # repainting inpainting
6
+ python vace/vace_pipeline.py --base wan --task inpainting --mode salientmasktrack --maskaug_mode original_expand --maskaug_ratio 0.5 --video "benchmarks/VACE-Benchmark/assets/examples/inpainting/ori_video.mp4" --prompt "一只巨大的金色凤凰从繁华的城市上空展翅飞过,羽毛如火焰般璀璨,闪烁着温暖的光辉,翅膀雄伟地展开。凤凰高昂着头,目光炯炯,轻轻扇动翅膀,散发出淡淡的光芒。下方是熙熙攘攘的市中心,人群惊叹,车水马龙,红蓝两色的霓虹灯在夜空下闪烁。镜头俯视城市街道,捕捉这一壮丽的景象,营造出既神秘又辉煌的氛围。"
7
+
8
+ # repainting outpainting
9
+ python vace/vace_pipeline.py --base wan --task outpainting --direction 'up,down,left,right' --expand_ratio 0.3 --video "benchmarks/VACE-Benchmark/assets/examples/outpainting/ori_video.mp4" --prompt "赛博朋克风格,无人机俯瞰视角下的现代西安城墙,镜头穿过永宁门时泛起金色涟漪,城墙砖块化作数据流重组为唐代长安城。周围的街道上流动的人群和飞驰的机械交通工具交织在一起,现代与古代的交融,城墙上的灯光闪烁,形成时空隧道的效果。全息投影技术展现历史变迁,粒子重组特效细腻逼真。大远景逐渐过渡到特写,聚焦于城门特效。"
10
+
11
+ # control depth
12
+ python vace/vace_pipeline.py --base wan --task depth --video "benchmarks/VACE-Benchmark/assets/examples/depth/ori_video.mp4" --prompt "一群年轻人在天空之城拍摄集体照。画面中,一对年轻情侣手牵手,轻声细语,相视而笑,周围是飞翔的彩色热气球和闪烁的星星,营造出浪漫的氛围。天空中,暖阳透过飘浮的云朵,洒下斑驳的光影。镜头以近景特写开始,随着情侣间的亲密互动,缓缓拉远。"
13
+
14
+ # control flow
15
+ python vace/vace_pipeline.py --base wan --task flow --video "benchmarks/VACE-Benchmark/assets/examples/flow/ori_video.mp4" --prompt "纪实摄影风格,一颗鲜红的小番茄缓缓落入盛着牛奶的玻璃杯中,溅起晶莹的水花。画面以慢镜头捕捉这一瞬间,水花在空中绽放,形成美丽的弧线。玻璃杯中的牛奶纯白,番茄的鲜红与之形成鲜明对比。背景简洁,突出主体。近景特写,垂直俯视视角,展现细节之美。"
16
+
17
+ # control gray
18
+ python vace/vace_pipeline.py --base wan --task gray --video "benchmarks/VACE-Benchmark/assets/examples/gray/ori_video.mp4" --prompt "镜头缓缓向右平移,身穿淡黄色坎肩长裙的长发女孩面对镜头露出灿烂的漏齿微笑。她的长发随风轻扬,眼神明亮而充满活力。背景是秋天红色和黄色的树叶,阳光透过树叶的缝隙洒下斑驳光影,营造出温馨自然的氛围。画面风格清新自然,仿佛夏日午后的一抹清凉。中景人像,强调自然光效和细腻的皮肤质感。"
19
+
20
+ # control pose
21
+ python vace/vace_pipeline.py --base wan --task pose --video "benchmarks/VACE-Benchmark/assets/examples/pose/ori_video.mp4" --prompt "在一个热带的庆祝派对上,一家人围坐在椰子树下的长桌旁。桌上摆满了异国风味的美食。长辈们愉悦地交谈,年轻人兴奋地举杯碰撞,孩子们在沙滩上欢乐奔跑。背景中是湛蓝的海洋和明亮的阳光,营造出轻松的气氛。镜头以动态中景捕捉每个开心的瞬间,温暖的阳光映照着他们幸福的面庞。"
22
+
23
+ # control scribble
24
+ python vace/vace_pipeline.py --base wan --task scribble --video "benchmarks/VACE-Benchmark/assets/examples/scribble/ori_video.mp4" --prompt "画面中荧光色彩的无人机从极低空高速掠过超现实主义风格的西安古城墙,尘埃反射着阳光。镜头快速切换至城墙上的砖石特写,阳光温暖地洒落,勾勒出每一块砖块的细腻纹理。整体画质清晰华丽,运镜流畅如水。"
25
+
26
+ # control layout
27
+ python vace/vace_pipeline.py --base wan --task layout_track --mode bboxtrack --bbox '54,200,614,448' --maskaug_mode bbox_expand --maskaug_ratio 0.2 --label 'bird' --video "benchmarks/VACE-Benchmark/assets/examples/layout/ori_video.mp4" --prompt "视频展示了一只成鸟在树枝上的巢中喂养它的幼鸟。成鸟在喂食的过程中,幼鸟张开嘴巴等待食物。随后,成鸟飞走,幼鸟继续等待。成鸟再次飞回,带回食物喂养幼鸟。整个视频的拍摄角度固定,聚焦于巢穴和鸟类的互动,背景是模糊的绿色植被,强调了鸟类的自然行为和生态环境。"
run_vace_preproccess.sh ADDED
@@ -0,0 +1,58 @@
1
+ #------------------------ Gradio ------------------------#
2
+ python vace/gradios/vace_preproccess_demo.py
3
+
4
+ #------------------------ Video ------------------------#
5
+ python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
6
+ python vace/vace_preproccess.py --task flow --video assets/videos/test.mp4
7
+ python vace/vace_preproccess.py --task gray --video assets/videos/test.mp4
8
+ python vace/vace_preproccess.py --task pose --video assets/videos/test.mp4
9
+ python vace/vace_preproccess.py --task scribble --video assets/videos/test.mp4
10
+ python vace/vace_preproccess.py --task frameref --mode firstframe --image assets/images/test.jpg
11
+ python vace/vace_preproccess.py --task frameref --mode lastframe --expand_num 55 --image assets/images/test.jpg
12
+ python vace/vace_preproccess.py --task frameref --mode firstlastframe --image assets/images/test.jpg,assets/images/test2.jpg
13
+ python vace/vace_preproccess.py --task clipref --mode firstclip --expand_num 66 --video assets/videos/test.mp4
14
+ python vace/vace_preproccess.py --task clipref --mode lastclip --expand_num 55 --video assets/videos/test.mp4
15
+ python vace/vace_preproccess.py --task clipref --mode firstlastclip --video assets/videos/test.mp4,assets/videos/test2.mp4
16
+ python vace/vace_preproccess.py --task inpainting --mode salient --video assets/videos/test.mp4
17
+ python vace/vace_preproccess.py --task inpainting --mode mask --mask assets/masks/test.png --video assets/videos/test.mp4
18
+ python vace/vace_preproccess.py --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4
19
+ python vace/vace_preproccess.py --task inpainting --mode salientmasktrack --video assets/videos/test.mp4
20
+ python vace/vace_preproccess.py --task inpainting --mode salientbboxtrack --video assets/videos/test.mp4
21
+ python vace/vace_preproccess.py --task inpainting --mode masktrack --mask assets/masks/test.png --video assets/videos/test.mp4
22
+ python vace/vace_preproccess.py --task inpainting --mode bboxtrack --bbox 50,50,550,700 --video assets/videos/test.mp4
23
+ python vace/vace_preproccess.py --task inpainting --mode label --label cat --video assets/videos/test.mp4
24
+ python vace/vace_preproccess.py --task inpainting --mode caption --caption 'boxing glove' --video assets/videos/test.mp4
25
+ python vace/vace_preproccess.py --task outpainting --video assets/videos/test.mp4
26
+ python vace/vace_preproccess.py --task outpainting --direction 'up,down,left,right' --expand_ratio 0.5 --video assets/videos/test.mp4
27
+ python vace/vace_preproccess.py --task layout_bbox --bbox '50,50,550,700 500,150,750,700' --label 'person'
28
+ python vace/vace_preproccess.py --task layout_track --mode masktrack --mask assets/masks/test.png --label 'cat' --video assets/videos/test.mp4
29
+ python vace/vace_preproccess.py --task layout_track --mode bboxtrack --bbox '50,50,550,700' --label 'cat' --video assets/videos/test.mp4
30
+ python vace/vace_preproccess.py --task layout_track --mode label --label 'cat' --maskaug_mode hull_expand --maskaug_ratio 0.1 --video assets/videos/test.mp4
31
+ python vace/vace_preproccess.py --task layout_track --mode caption --caption 'boxing glove' --maskaug_mode bbox --video assets/videos/test.mp4 --label 'glove'
32
+
33
+ #------------------------ Image ------------------------#
34
+ python vace/vace_preproccess.py --task image_face --image assets/images/test3.jpg
35
+ python vace/vace_preproccess.py --task image_salient --image assets/images/test.jpg
36
+ python vace/vace_preproccess.py --task image_inpainting --mode 'salientbboxtrack' --image assets/images/test2.jpg
37
+ python vace/vace_preproccess.py --task image_inpainting --mode 'salientmasktrack' --maskaug_mode hull_expand --maskaug_ratio 0.3 --image assets/images/test2.jpg
38
+ python vace/vace_preproccess.py --task image_reference --mode plain --image assets/images/test.jpg
39
+ python vace/vace_preproccess.py --task image_reference --mode salient --image assets/images/test.jpg
40
+ python vace/vace_preproccess.py --task image_reference --mode mask --mask assets/masks/test2.png --image assets/images/test.jpg
41
+ python vace/vace_preproccess.py --task image_reference --mode bbox --bbox 0,264,338,636 --image assets/images/test.jpg
42
+ python vace/vace_preproccess.py --task image_reference --mode salientmasktrack --image assets/images/test.jpg # easy way, recommended
43
+ python vace/vace_preproccess.py --task image_reference --mode salientbboxtrack --bbox 0,264,338,636 --maskaug_mode original_expand --maskaug_ratio 0.2 --image assets/images/test.jpg
44
+ python vace/vace_preproccess.py --task image_reference --mode masktrack --mask assets/masks/test2.png --image assets/images/test.jpg
45
+ python vace/vace_preproccess.py --task image_reference --mode bboxtrack --bbox 0,264,338,636 --image assets/images/test.jpg
46
+ python vace/vace_preproccess.py --task image_reference --mode label --label 'cat' --image assets/images/test.jpg
47
+ python vace/vace_preproccess.py --task image_reference --mode caption --caption 'flower' --maskaug_mode bbox --maskaug_ratio 0.3 --image assets/images/test.jpg
48
+
49
+ #------------------------ Composition ------------------------#
50
+ python vace/vace_preproccess.py --task reference_anything --mode salientmasktrack --image assets/images/test.jpg
51
+ python vace/vace_preproccess.py --task reference_anything --mode salientbboxtrack --image assets/images/test.jpg,assets/images/test2.jpg
52
+ python vace/vace_preproccess.py --task animate_anything --mode salientbboxtrack --video assets/videos/test.mp4 --image assets/images/test.jpg
53
+ python vace/vace_preproccess.py --task swap_anything --mode salientmasktrack --video assets/videos/test.mp4 --image assets/images/test.jpg
54
+ python vace/vace_preproccess.py --task swap_anything --mode label,salientbboxtrack --label 'cat' --maskaug_mode bbox --maskaug_ratio 0.3 --video assets/videos/test.mp4 --image assets/images/test.jpg
55
+ python vace/vace_preproccess.py --task swap_anything --mode label,plain --label 'cat' --maskaug_mode bbox --maskaug_ratio 0.3 --video assets/videos/test.mp4 --image assets/images/test.jpg
56
+ python vace/vace_preproccess.py --task expand_anything --mode salientbboxtrack --direction 'left,right' --expand_ratio 0.5 --expand_num 80 --image assets/images/test.jpg,assets/images/test2.jpg
57
+ python vace/vace_preproccess.py --task expand_anything --mode firstframe,plain --direction 'left,right' --expand_ratio 0.5 --expand_num 80 --image assets/images/test.jpg,assets/images/test2.jpg
58
+ python vace/vace_preproccess.py --task move_anything --bbox '0,264,338,636 400,264,538,636' --expand_num 80 --label 'cat' --image assets/images/test.jpg
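
The lines above are plain CLI invocations of vace/vace_preproccess.py. A minimal sketch of driving one of them from Python instead of the shell (the task and flags are copied verbatim from the depth line above; wrapping the call in subprocess is an illustration, not part of the repo):

import subprocess

# Mirrors: python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
subprocess.run(
    ["python", "vace/vace_preproccess.py", "--task", "depth", "--video", "assets/videos/test.mp4"],
    check=True,  # raise if preprocessing fails
)
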
run_vace_wan.sh ADDED
@@ -0,0 +1,48 @@
1
+ #------------------------ Gradio ------------------------#
2
+ python vace/gradios/vace_wan_demo.py
3
+
4
+ #------------------------ CLI ------------------------#
5
+ # txt2vid txt2vid
6
+ python vace/vace_wan_inference.py --prompt "狂风巨浪的大海,镜头缓缓推进,一艘渺小的帆船在汹涌的波涛中挣扎漂荡。海面上白沫翻滚,帆船时隐时现,仿佛随时可能被巨浪吞噬。天空乌云密布,雷声轰鸣,海鸥在空中盘旋尖叫。帆船上的人们紧紧抓住缆绳,努力保持平衡。画面风格写实,充满紧张和动感。近景特写,强调风浪的冲击力和帆船的摇晃"
7
+
8
+ # extension firstframe
9
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_mask.mp4" --prompt "纪实摄影风格,前景是一位中国越野爱好者坐在越野车上,手持车载电台正在进行通联。他五官清晰,表情专注,眼神坚定地望向前方。越野车停在户外,车身略显脏污,显示出经历过的艰难路况。镜头从车外缓缓拉近,最后定格在人物的面部特写上,展现出他的坚定与热情。中景到近景,动态镜头运镜。"
10
+
11
+ # repainting inpainting
12
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_mask.mp4" --prompt "一只巨大的金色凤凰从繁华的城市上空展翅飞过,羽毛如火焰般璀璨,闪烁着温暖的光辉,翅膀雄伟地展开。凤凰高昂着头,目光炯炯,轻轻扇动翅膀,散发出淡淡的光芒。下方是熙熙攘攘的市中心,人群惊叹,车水马龙,红蓝两色的霓虹灯在夜空下闪烁。镜头俯视城市街道,捕捉这一壮丽的景象,营造出既神秘又辉煌的氛围。"
13
+
14
+ # repainting outpainting
15
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_mask.mp4" --prompt "赛博朋克风格,无人机俯瞰视角下的现代西安城墙,镜头穿过永宁门时泛起金色涟漪,城墙砖块化作数据流重组为唐代长安城。周围的街道上流动的人群和飞驰的机械交通工具交织在一起,现代与古代的交融,城墙上的灯光闪烁,形成时空隧道的效果。全息投影技术展现历史变迁,粒子重组特效细腻逼真。大远景逐渐过渡到特写,聚焦于城门特效。"
16
+
17
+ # control depth
18
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/depth/src_video.mp4" --prompt "一群年轻人在天空之城拍摄集体照。画面中,一对年轻情侣手牵手,轻声细语,相视而笑,周围是飞翔的彩色热气球和闪烁的星星,营造出浪漫的氛围。天空中,暖阳透过飘浮的云朵,洒下斑驳的光影。镜头以近景特写开始,随着情侣间的亲密互动,缓缓拉远。"
19
+
20
+ # control flow
21
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/flow/src_video.mp4" --prompt "纪实摄影风格,一颗鲜红的小番茄缓缓落入盛着牛奶的玻璃杯中,溅起晶莹的水花。画面以慢镜头捕捉这一瞬间,水花在空中绽放,形成美丽的弧线。玻璃杯中的牛奶纯白,番茄的鲜红与之形成鲜明对比。背景简洁,突出主体。近景特写,垂直俯视视角,展现细节之美。"
22
+
23
+ # control gray
24
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/gray/src_video.mp4" --prompt "镜头缓缓向右平移,身穿淡黄色坎肩长裙的长发女孩面对镜头露出灿烂的漏齿微笑。她的长发随风轻扬,眼神明亮而充满活力。背景是秋天红色和黄色的树叶,阳光透过树叶的缝隙洒下斑驳光影,营造出温馨自然的氛围。画面风格清新自然,仿佛夏日午后的一抹清凉。中景人像,强调自然光效和细腻的皮肤质感。"
25
+
26
+ # control pose
27
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/pose/src_video.mp4" --prompt "在一个热带的庆祝派对上,一家人围坐在椰子树下的长桌旁。桌上摆满了异国风味的美食。长辈们愉悦地交谈,年轻人兴奋地举杯碰撞,孩子们在沙滩上欢乐奔跑。背景中是湛蓝的海洋和明亮的阳光,营造出轻松的气氛。镜头以动态中景捕捉每个开心的瞬间,温暖的阳光映照着他们幸福的面庞。"
28
+
29
+ # control scribble
30
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/scribble/src_video.mp4" --prompt "画面中荧光色彩的无人机从极低空高速掠过超现实主义风格的西安古城墙,尘埃反射着阳光。镜头快速切换至城墙上的砖石特写,阳光温暖地洒落,勾勒出每一块砖块的细腻纹理。整体画质清晰华丽,运镜流畅如水。"
31
+
32
+ # control layout
33
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/layout/src_video.mp4" --prompt "视频展示了一只成鸟在树枝上的巢中喂养它的幼鸟。成鸟在喂食的过程中,幼鸟张开嘴巴等待食物。随后,成鸟飞走,幼鸟继续等待。成鸟再次飞回,带回食物喂养幼鸟。整个视频的拍摄角度固定,聚焦于巢穴和鸟类的互动,背景是模糊的绿色植被,强调了鸟类的自然行为和生态环境。"
34
+
35
+ # reference face
36
+ python vace/vace_wan_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/face/src_ref_image_1.png" --prompt "视频展示了一位长着尖耳朵的老人,他有一头银白色的长发和小胡子,穿着一件色彩斑斓的长袍,内搭金色衬衫,散发出神秘与智慧的气息。背景为一个华丽宫殿的内部,金碧辉煌。灯光明亮,照亮他脸上的神采奕奕。摄像机旋转动态拍摄,捕捉老人轻松挥手的动作。"
37
+
38
+ # reference object
39
+ python vace/vace_wan_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/object/src_ref_image_1.png" --prompt "经典游戏角色马里奥在绿松石色水下世界中,四周环绕着珊瑚和各种各样的热带鱼。马里奥兴奋地向上跳起,摆出经典的欢快姿势,身穿鲜明的蓝色潜水服,红色的潜水面罩上印有“M”标志,脚上是一双潜水靴。背景中,水泡随波逐流,浮现出一个巨大而友好的海星。摄像机从水底向上快速移动,捕捉他跃出水面的瞬间,灯光明亮而流动。该场景融合了动画与幻想元素,令人惊叹。"
40
+
41
+ # composition reference_anything
42
+ python vace/vace_wan_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_1.png,benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_2.png" --prompt "一名打扮成超人的男子自信地站着,面对镜头,肩头有一只充满活力的毛绒黄色鸭子。他留着整齐的短发和浅色胡须,鸭子有橙色的喙和脚,它的翅膀稍微展开,脚分开以保持稳定。他的表情严肃而坚定。他穿着标志性的蓝红超人服装,胸前有黄色“S”标志。斗篷在他身后飘逸。背景有行人。相机位于视线水平,捕捉角色的整个上半身。灯光均匀明亮。"
43
+
44
+ # composition swap_anything
45
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_ref_image_1.png" --prompt "视频展示了一个人在宽阔的草原上骑马。他有淡紫色长发,穿着传统服饰白上衣黑裤子,动画建模画风,看起来像是在进行某种户外活动或者是在进行某种表演。背景是壮观的山脉云的天空,给人一种宁静而广阔的感觉。整个视频的拍摄角度是固定的,重点展示了骑手和他的马。"
46
+
47
+ # composition expand_anything
48
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_ref_image_1.png" --prompt "古典油画风格,背景是一条河边,画面中央一位成熟优雅的女人,穿着长裙坐在椅子上。她双手从怀里取出打开的红色心形墨镜戴上。固定机位。"
tests/test_annotators.py ADDED
@@ -0,0 +1,568 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+
4
+ import os
5
+ import unittest
6
+ import numpy as np
7
+ from PIL import Image
8
+
9
+ from vace.annotators.utils import read_video_frames
10
+ from vace.annotators.utils import save_one_video
11
+
12
+ class AnnotatorTest(unittest.TestCase):
13
+ def setUp(self):
14
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
15
+ self.save_dir = './cache/test_annotator'
16
+ if not os.path.exists(self.save_dir):
17
+ os.makedirs(self.save_dir)
18
+ # load test image
19
+ self.image_path = './assets/images/test.jpg'
20
+ self.image = Image.open(self.image_path).convert('RGB')
21
+ # load test video
22
+ self.video_path = './assets/videos/test.mp4'
23
+ self.frames = read_video_frames(self.video_path)
24
+
25
+ def tearDown(self):
26
+ super().tearDown()
27
+
28
+ @unittest.skip('')
29
+ def test_annotator_gray_image(self):
30
+ from vace.annotators.gray import GrayAnnotator
31
+ cfg_dict = {}
32
+ anno_ins = GrayAnnotator(cfg_dict)
33
+ anno_image = anno_ins.forward(np.array(self.image))
34
+ save_path = os.path.join(self.save_dir, 'test_gray_image.png')
35
+ Image.fromarray(anno_image).save(save_path)
36
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
37
+
38
+ @unittest.skip('')
39
+ def test_annotator_gray_video(self):
40
+ from vace.annotators.gray import GrayAnnotator
41
+ cfg_dict = {}
42
+ anno_ins = GrayAnnotator(cfg_dict)
43
+ ret_frames = []
44
+ for frame in self.frames:
45
+ anno_frame = anno_ins.forward(np.array(frame))
46
+ ret_frames.append(anno_frame)
47
+ save_path = os.path.join(self.save_dir, 'test_gray_video.mp4')
48
+ save_one_video(save_path, ret_frames, fps=16)
49
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
50
+
51
+ @unittest.skip('')
52
+ def test_annotator_gray_video_2(self):
53
+ from vace.annotators.gray import GrayVideoAnnotator
54
+ cfg_dict = {}
55
+ anno_ins = GrayVideoAnnotator(cfg_dict)
56
+ ret_frames = anno_ins.forward(self.frames)
57
+ save_path = os.path.join(self.save_dir, 'test_gray_video_2.mp4')
58
+ save_one_video(save_path, ret_frames, fps=16)
59
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
60
+
61
+
62
+ @unittest.skip('')
63
+ def test_annotator_pose_image(self):
64
+ from vace.annotators.pose import PoseBodyFaceAnnotator
65
+ cfg_dict = {
66
+ "DETECTION_MODEL": "models/VACE-Annotators/pose/yolox_l.onnx",
67
+ "POSE_MODEL": "models/VACE-Annotators/pose/dw-ll_ucoco_384.onnx",
68
+ "RESIZE_SIZE": 1024
69
+ }
70
+ anno_ins = PoseBodyFaceAnnotator(cfg_dict)
71
+ anno_image = anno_ins.forward(np.array(self.image))
72
+ save_path = os.path.join(self.save_dir, 'test_pose_image.png')
73
+ Image.fromarray(anno_image).save(save_path)
74
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
75
+
76
+ @unittest.skip('')
77
+ def test_annotator_pose_video(self):
78
+ from vace.annotators.pose import PoseBodyFaceAnnotator
79
+ cfg_dict = {
80
+ "DETECTION_MODEL": "models/VACE-Annotators/pose/yolox_l.onnx",
81
+ "POSE_MODEL": "models/VACE-Annotators/pose/dw-ll_ucoco_384.onnx",
82
+ "RESIZE_SIZE": 1024
83
+ }
84
+ anno_ins = PoseBodyFaceAnnotator(cfg_dict)
85
+ ret_frames = []
86
+ for frame in self.frames:
87
+ anno_frame = anno_ins.forward(np.array(frame))
88
+ ret_frames.append(anno_frame)
89
+ save_path = os.path.join(self.save_dir, 'test_pose_video.mp4')
90
+ save_one_video(save_path, ret_frames, fps=16)
91
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
92
+
93
+ @unittest.skip('')
94
+ def test_annotator_pose_video_2(self):
95
+ from vace.annotators.pose import PoseBodyFaceVideoAnnotator
96
+ cfg_dict = {
97
+ "DETECTION_MODEL": "models/VACE-Annotators/pose/yolox_l.onnx",
98
+ "POSE_MODEL": "models/VACE-Annotators/pose/dw-ll_ucoco_384.onnx",
99
+ "RESIZE_SIZE": 1024
100
+ }
101
+ anno_ins = PoseBodyFaceVideoAnnotator(cfg_dict)
102
+ ret_frames = anno_ins.forward(self.frames)
103
+ save_path = os.path.join(self.save_dir, 'test_pose_video_2.mp4')
104
+ save_one_video(save_path, ret_frames, fps=16)
105
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
106
+
107
+ # @unittest.skip('')
108
+ def test_annotator_depth_image(self):
109
+ from vace.annotators.depth import DepthAnnotator
110
+ cfg_dict = {
111
+ "PRETRAINED_MODEL": "models/VACE-Annotators/depth/depth_anything_v2_vitl.pth"
112
+ }
113
+ anno_ins = DepthAnnotator(cfg_dict)
114
+ anno_image = anno_ins.forward(np.array(self.image))
115
+ save_path = os.path.join(self.save_dir, 'test_depth_image.png')
116
+ Image.fromarray(anno_image).save(save_path)
117
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
118
+
119
+ # @unittest.skip('')
120
+ def test_annotator_depth_video(self):
121
+ from vace.annotators.depth import DepthAnnotator
122
+ cfg_dict = {
123
+ "PRETRAINED_MODEL": "models/VACE-Annotators/depth/depth_anything_v2_vitl.pth"
124
+ }
125
+ anno_ins = DepthAnnotator(cfg_dict)
126
+ ret_frames = []
127
+ for frame in self.frames:
128
+ anno_frame = anno_ins.forward(np.array(frame))
129
+ ret_frames.append(anno_frame)
130
+ save_path = os.path.join(self.save_dir, 'test_depth_video.mp4')
131
+ save_one_video(save_path, ret_frames, fps=16)
132
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
133
+
134
+ @unittest.skip('')
135
+ def test_annotator_depth_video_2(self):
136
+ from vace.annotators.depth import DepthVideoAnnotator
137
+ cfg_dict = {
138
+ "PRETRAINED_MODEL": "models/VACE-Annotators/depth/dpt_hybrid-midas-501f0c75.pt"
139
+ }
140
+ anno_ins = DepthVideoAnnotator(cfg_dict)
141
+ ret_frames = anno_ins.forward(self.frames)
142
+ save_path = os.path.join(self.save_dir, 'test_depth_video_2.mp4')
143
+ save_one_video(save_path, ret_frames, fps=16)
144
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
145
+
146
+ @unittest.skip('')
147
+ def test_annotator_scribble_image(self):
148
+ from vace.annotators.scribble import ScribbleAnnotator
149
+ cfg_dict = {
150
+ "PRETRAINED_MODEL": "models/VACE-Annotators/scribble/anime_style/netG_A_latest.pth"
151
+ }
152
+ anno_ins = ScribbleAnnotator(cfg_dict)
153
+ anno_image = anno_ins.forward(np.array(self.image))
154
+ save_path = os.path.join(self.save_dir, 'test_scribble_image.png')
155
+ Image.fromarray(anno_image).save(save_path)
156
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
157
+
158
+ @unittest.skip('')
159
+ def test_annotator_scribble_video(self):
160
+ from vace.annotators.scribble import ScribbleAnnotator
161
+ cfg_dict = {
162
+ "PRETRAINED_MODEL": "models/VACE-Annotators/scribble/anime_style/netG_A_latest.pth"
163
+ }
164
+ anno_ins = ScribbleAnnotator(cfg_dict)
165
+ ret_frames = []
166
+ for frame in self.frames:
167
+ anno_frame = anno_ins.forward(np.array(frame))
168
+ ret_frames.append(anno_frame)
169
+ save_path = os.path.join(self.save_dir, 'test_scribble_video.mp4')
170
+ save_one_video(save_path, ret_frames, fps=16)
171
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
172
+
173
+ @unittest.skip('')
174
+ def test_annotator_scribble_video_2(self):
175
+ from vace.annotators.scribble import ScribbleVideoAnnotator
176
+ cfg_dict = {
177
+ "PRETRAINED_MODEL": "models/VACE-Annotators/scribble/anime_style/netG_A_latest.pth"
178
+ }
179
+ anno_ins = ScribbleVideoAnnotator(cfg_dict)
180
+ ret_frames = anno_ins.forward(self.frames)
181
+ save_path = os.path.join(self.save_dir, 'test_scribble_video_2.mp4')
182
+ save_one_video(save_path, ret_frames, fps=16)
183
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
184
+
185
+ @unittest.skip('')
186
+ def test_annotator_flow_video(self):
187
+ from vace.annotators.flow import FlowVisAnnotator
188
+ cfg_dict = {
189
+ "PRETRAINED_MODEL": "models/VACE-Annotators/flow/raft-things.pth"
190
+ }
191
+ anno_ins = FlowVisAnnotator(cfg_dict)
192
+ ret_frames = anno_ins.forward(self.frames)
193
+ save_path = os.path.join(self.save_dir, 'test_flow_video.mp4')
194
+ save_one_video(save_path, ret_frames, fps=16)
195
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
196
+
197
+ @unittest.skip('')
198
+ def test_annotator_frameref_video_1(self):
199
+ from vace.annotators.frameref import FrameRefExtractAnnotator
200
+ cfg_dict = {
201
+ "REF_CFG": [{"mode": "first", "proba": 0.1},
202
+ {"mode": "last", "proba": 0.1},
203
+ {"mode": "firstlast", "proba": 0.1},
204
+ {"mode": "random", "proba": 0.1}],
205
+ }
206
+ anno_ins = FrameRefExtractAnnotator(cfg_dict)
207
+ ret_frames, ret_masks = anno_ins.forward(self.frames, ref_num=10)
208
+ save_path = os.path.join(self.save_dir, 'test_frameref_video_1.mp4')
209
+ save_one_video(save_path, ret_frames, fps=16)
210
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
211
+ save_path = os.path.join(self.save_dir, 'test_frameref_mask_1.mp4')
212
+ save_one_video(save_path, ret_masks, fps=16)
213
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
214
+
215
+ @unittest.skip('')
216
+ def test_annotator_frameref_video_2(self):
217
+ from vace.annotators.frameref import FrameRefExpandAnnotator
218
+ cfg_dict = {}
219
+ anno_ins = FrameRefExpandAnnotator(cfg_dict)
220
+ ret_frames, ret_masks = anno_ins.forward(frames=self.frames, mode='lastclip', expand_num=50)
221
+ save_path = os.path.join(self.save_dir, 'test_frameref_video_2.mp4')
222
+ save_one_video(save_path, ret_frames, fps=16)
223
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
224
+ save_path = os.path.join(self.save_dir, 'test_frameref_mask_2.mp4')
225
+ save_one_video(save_path, ret_masks, fps=16)
226
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
227
+
228
+
229
+ @unittest.skip('')
230
+ def test_annotator_outpainting_1(self):
231
+ from vace.annotators.outpainting import OutpaintingAnnotator
232
+ cfg_dict = {
233
+ "RETURN_MASK": True,
234
+ "KEEP_PADDING_RATIO": 1,
235
+ "MASK_COLOR": "gray"
236
+ }
237
+ anno_ins = OutpaintingAnnotator(cfg_dict)
238
+ ret_data = anno_ins.forward(self.image, direction=['right', 'up', 'down'], expand_ratio=0.5)
239
+ save_path = os.path.join(self.save_dir, 'test_outpainting_image.png')
240
+ Image.fromarray(ret_data['image']).save(save_path)
241
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
242
+ save_path = os.path.join(self.save_dir, 'test_outpainting_mask.png')
243
+ Image.fromarray(ret_data['mask']).save(save_path)
244
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
245
+
246
+ @unittest.skip('')
247
+ def test_annotator_outpainting_video_1(self):
248
+ from vace.annotators.outpainting import OutpaintingVideoAnnotator
249
+ cfg_dict = {
250
+ "RETURN_MASK": True,
251
+ "KEEP_PADDING_RATIO": 1,
252
+ "MASK_COLOR": "gray"
253
+ }
254
+ anno_ins = OutpaintingVideoAnnotator(cfg_dict)
255
+ ret_data = anno_ins.forward(frames=self.frames, direction=['right', 'up', 'down'], expand_ratio=0.5)
256
+ save_path = os.path.join(self.save_dir, 'test_outpainting_video_1.mp4')
257
+ save_one_video(save_path, ret_data['frames'], fps=16)
258
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
259
+ save_path = os.path.join(self.save_dir, 'test_outpainting_mask_1.mp4')
260
+ save_one_video(save_path, ret_data['masks'], fps=16)
261
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
262
+
263
+ @unittest.skip('')
264
+ def test_annotator_outpainting_inner_1(self):
265
+ from vace.annotators.outpainting import OutpaintingInnerAnnotator
266
+ cfg_dict = {
267
+ "RETURN_MASK": True,
268
+ "KEEP_PADDING_RATIO": 1,
269
+ "MASK_COLOR": "gray"
270
+ }
271
+ anno_ins = OutpaintingInnerAnnotator(cfg_dict)
272
+ ret_data = anno_ins.forward(self.image, direction=['right', 'up', 'down'], expand_ratio=0.15)
273
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_image.png')
274
+ Image.fromarray(ret_data['image']).save(save_path)
275
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
276
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_mask.png')
277
+ Image.fromarray(ret_data['mask']).save(save_path)
278
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
279
+
280
+ @unittest.skip('')
281
+ def test_annotator_outpainting_inner_video_1(self):
282
+ from vace.annotators.outpainting import OutpaintingInnerVideoAnnotator
283
+ cfg_dict = {
284
+ "RETURN_MASK": True,
285
+ "KEEP_PADDING_RATIO": 1,
286
+ "MASK_COLOR": "gray"
287
+ }
288
+ anno_ins = OutpaintingInnerVideoAnnotator(cfg_dict)
289
+ ret_data = anno_ins.forward(self.frames, direction=['right', 'up', 'down'], expand_ratio=0.15)
290
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_video_1.mp4')
291
+ save_one_video(save_path, ret_data['frames'], fps=16)
292
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
293
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_mask_1.mp4')
294
+ save_one_video(save_path, ret_data['masks'], fps=16)
295
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
296
+
297
+ @unittest.skip('')
298
+ def test_annotator_salient(self):
299
+ from vace.annotators.salient import SalientAnnotator
300
+ cfg_dict = {
301
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
302
+ }
303
+ anno_ins = SalientAnnotator(cfg_dict)
304
+ ret_data = anno_ins.forward(self.image)
305
+ save_path = os.path.join(self.save_dir, 'test_salient_image.png')
306
+ Image.fromarray(ret_data).save(save_path)
307
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
308
+
309
+ @unittest.skip('')
310
+ def test_annotator_salient_video(self):
311
+ from vace.annotators.salient import SalientVideoAnnotator
312
+ cfg_dict = {
313
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
314
+ }
315
+ anno_ins = SalientVideoAnnotator(cfg_dict)
316
+ ret_frames = anno_ins.forward(self.frames)
317
+ save_path = os.path.join(self.save_dir, 'test_salient_video.mp4')
318
+ save_one_video(save_path, ret_frames, fps=16)
319
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
320
+
321
+ @unittest.skip('')
322
+ def test_annotator_layout_video(self):
323
+ from vace.annotators.layout import LayoutBboxAnnotator
324
+ cfg_dict = {
325
+ "RAM_TAG_COLOR_PATH": "models/VACE-Annotators/layout/ram_tag_color_list.txt",
326
+ }
327
+ anno_ins = LayoutBboxAnnotator(cfg_dict)
328
+ ret_frames = anno_ins.forward(bbox=[(544, 288, 744, 680), (1112, 240, 1280, 712)], frame_size=(720, 1280), num_frames=49, label='person')
329
+ save_path = os.path.join(self.save_dir, 'test_layout_video.mp4')
330
+ save_one_video(save_path, ret_frames, fps=16)
331
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
332
+
333
+ @unittest.skip('')
334
+ def test_annotator_layout_mask_video(self):
335
+ # salient
336
+ from vace.annotators.salient import SalientVideoAnnotator
337
+ cfg_dict = {
338
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
339
+ }
340
+ anno_ins = SalientVideoAnnotator(cfg_dict)
341
+ salient_frames = anno_ins.forward(self.frames)
342
+
343
+ # mask layout
344
+ from vace.annotators.layout import LayoutMaskAnnotator
345
+ cfg_dict = {
346
+ "RAM_TAG_COLOR_PATH": "models/VACE-Annotators/layout/ram_tag_color_list.txt",
347
+ }
348
+ anno_ins = LayoutMaskAnnotator(cfg_dict)
349
+ ret_frames = anno_ins.forward(salient_frames, label='cat')
350
+ save_path = os.path.join(self.save_dir, 'test_mask_layout_video.mp4')
351
+ save_one_video(save_path, ret_frames, fps=16)
352
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
353
+
354
+ @unittest.skip('')
355
+ def test_annotator_layout_mask_video_2(self):
356
+ # salient
357
+ from vace.annotators.salient import SalientVideoAnnotator
358
+ cfg_dict = {
359
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
360
+ }
361
+ anno_ins = SalientVideoAnnotator(cfg_dict)
362
+ salient_frames = anno_ins.forward(self.frames)
363
+
364
+ # mask layout
365
+ from vace.annotators.layout import LayoutMaskAnnotator
366
+ cfg_dict = {
367
+ "RAM_TAG_COLOR_PATH": "models/VACE-Annotators/layout/ram_tag_color_list.txt",
368
+ "USE_AUG": True
369
+ }
370
+ anno_ins = LayoutMaskAnnotator(cfg_dict)
371
+ ret_frames = anno_ins.forward(salient_frames, label='cat', mask_cfg={'mode': 'bbox_expand'})
372
+ save_path = os.path.join(self.save_dir, 'test_mask_layout_video_2.mp4')
373
+ save_one_video(save_path, ret_frames, fps=16)
374
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
375
+
376
+
377
+ @unittest.skip('')
378
+ def test_annotator_maskaug_video(self):
379
+ # salient
380
+ from vace.annotators.salient import SalientVideoAnnotator
381
+ cfg_dict = {
382
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
383
+ }
384
+ anno_ins = SalientVideoAnnotator(cfg_dict)
385
+ salient_frames = anno_ins.forward(self.frames)
386
+
387
+ # mask aug
388
+ from vace.annotators.maskaug import MaskAugAnnotator
389
+ cfg_dict = {}
390
+ anno_ins = MaskAugAnnotator(cfg_dict)
391
+ ret_frames = anno_ins.forward(salient_frames, mask_cfg={'mode': 'hull_expand'})
392
+ save_path = os.path.join(self.save_dir, 'test_maskaug_video.mp4')
393
+ save_one_video(save_path, ret_frames, fps=16)
394
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
395
+
396
+
397
+ @unittest.skip('')
398
+ def test_annotator_ram(self):
399
+ from vace.annotators.ram import RAMAnnotator
400
+ cfg_dict = {
401
+ "TOKENIZER_PATH": "models/VACE-Annotators/ram/bert-base-uncased",
402
+ "PRETRAINED_MODEL": "models/VACE-Annotators/ram/ram_plus_swin_large_14m.pth",
403
+ }
404
+ anno_ins = RAMAnnotator(cfg_dict)
405
+ ret_data = anno_ins.forward(self.image)
406
+ print(ret_data)
407
+
408
+ @unittest.skip('')
409
+ def test_annotator_gdino_v1(self):
410
+ from vace.annotators.gdino import GDINOAnnotator
411
+ cfg_dict = {
412
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
413
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
414
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
415
+ }
416
+ anno_ins = GDINOAnnotator(cfg_dict)
417
+ ret_data = anno_ins.forward(self.image, caption="a cat and a vase")
418
+ print(ret_data)
419
+
420
+ @unittest.skip('')
421
+ def test_annotator_gdino_v2(self):
422
+ from vace.annotators.gdino import GDINOAnnotator
423
+ cfg_dict = {
424
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
425
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
426
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
427
+ }
428
+ anno_ins = GDINOAnnotator(cfg_dict)
429
+ ret_data = anno_ins.forward(self.image, classes=["cat", "vase"])
430
+ print(ret_data)
431
+
432
+ @unittest.skip('')
433
+ def test_annotator_gdino_with_ram(self):
434
+ from vace.annotators.gdino import GDINORAMAnnotator
435
+ cfg_dict = {
436
+ "RAM": {
437
+ "TOKENIZER_PATH": "models/VACE-Annotators/ram/bert-base-uncased",
438
+ "PRETRAINED_MODEL": "models/VACE-Annotators/ram/ram_plus_swin_large_14m.pth",
439
+ },
440
+ "GDINO": {
441
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
442
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
443
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
444
+ }
445
+
446
+ }
447
+ anno_ins = GDINORAMAnnotator(cfg_dict)
448
+ ret_data = anno_ins.forward(self.image)
449
+ print(ret_data)
450
+
451
+ @unittest.skip('')
452
+ def test_annotator_sam2(self):
453
+ from vace.annotators.sam2 import SAM2VideoAnnotator
454
+ from vace.annotators.utils import save_sam2_video
455
+ cfg_dict = {
456
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
457
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
458
+ }
459
+ anno_ins = SAM2VideoAnnotator(cfg_dict)
460
+ ret_data = anno_ins.forward(video=self.video_path, input_box=[0, 0, 640, 480])
461
+ video_segments = ret_data['annotations']
462
+ save_path = os.path.join(self.save_dir, 'test_sam2_video')
463
+ if not os.path.exists(save_path):
464
+ os.makedirs(save_path)
465
+ save_sam2_video(video_path=self.video_path, video_segments=video_segments, output_video_path=save_path)
466
+ print(save_path)
467
+
468
+
469
+ @unittest.skip('')
470
+ def test_annotator_sam2salient(self):
471
+ from vace.annotators.sam2 import SAM2SalientVideoAnnotator
472
+ from vace.annotators.utils import save_sam2_video
473
+ cfg_dict = {
474
+ "SALIENT": {
475
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
476
+ },
477
+ "SAM2": {
478
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
479
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
480
+ }
481
+
482
+ }
483
+ anno_ins = SAM2SalientVideoAnnotator(cfg_dict)
484
+ ret_data = anno_ins.forward(video=self.video_path)
485
+ video_segments = ret_data['annotations']
486
+ save_path = os.path.join(self.save_dir, 'test_sam2salient_video')
487
+ if not os.path.exists(save_path):
488
+ os.makedirs(save_path)
489
+ save_sam2_video(video_path=self.video_path, video_segments=video_segments, output_video_path=save_path)
490
+ print(save_path)
491
+
492
+
493
+ @unittest.skip('')
494
+ def test_annotator_sam2gdinoram_video(self):
495
+ from vace.annotators.sam2 import SAM2GDINOVideoAnnotator
496
+ from vace.annotators.utils import save_sam2_video
497
+ cfg_dict = {
498
+ "GDINO": {
499
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
500
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
501
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
502
+ },
503
+ "SAM2": {
504
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
505
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
506
+ }
507
+ }
508
+ anno_ins = SAM2GDINOVideoAnnotator(cfg_dict)
509
+ ret_data = anno_ins.forward(video=self.video_path, classes='cat')
510
+ video_segments = ret_data['annotations']
511
+ save_path = os.path.join(self.save_dir, 'test_sam2gdino_video')
512
+ if not os.path.exists(save_path):
513
+ os.makedirs(save_path)
514
+ save_sam2_video(video_path=self.video_path, video_segments=video_segments, output_video_path=save_path)
515
+ print(save_path)
516
+
517
+ @unittest.skip('')
518
+ def test_annotator_sam2_image(self):
519
+ from vace.annotators.sam2 import SAM2ImageAnnotator
520
+ cfg_dict = {
521
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
522
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
523
+ }
524
+ anno_ins = SAM2ImageAnnotator(cfg_dict)
525
+ ret_data = anno_ins.forward(image=self.image, input_box=[0, 0, 640, 480])
526
+ print(ret_data)
527
+
528
+ @unittest.skip('')
529
+ def test_annotator_prompt_extend(self):
530
+ from vace.annotators.prompt_extend import PromptExtendAnnotator
531
+ from vace.configs.prompt_preprocess import WAN_LM_ZH_SYS_PROMPT, WAN_LM_EN_SYS_PROMPT, LTX_LM_EN_SYS_PROMPT
532
+ cfg_dict = {
533
+ "MODEL_NAME": "models/VACE-Annotators/llm/Qwen2.5-3B-Instruct" # "Qwen2.5_3B"
534
+ }
535
+ anno_ins = PromptExtendAnnotator(cfg_dict)
536
+ ret_data = anno_ins.forward('一位男孩', system_prompt=WAN_LM_ZH_SYS_PROMPT)
537
+ print('wan_zh:', ret_data)
538
+ ret_data = anno_ins.forward('a boy', system_prompt=WAN_LM_EN_SYS_PROMPT)
539
+ print('wan_en:', ret_data)
540
+ ret_data = anno_ins.forward('a boy', system_prompt=WAN_LM_ZH_SYS_PROMPT)
541
+ print('wan_zh en:', ret_data)
542
+ ret_data = anno_ins.forward('a boy', system_prompt=LTX_LM_EN_SYS_PROMPT)
543
+ print('ltx_en:', ret_data)
544
+
545
+ from vace.annotators.utils import get_annotator
546
+ anno_ins = get_annotator(config_type='prompt', config_task='ltx_en', return_dict=False)
547
+ ret_data = anno_ins.forward('a boy', seed=2025)
548
+ print('ltx_en:', ret_data)
549
+ ret_data = anno_ins.forward('a boy')
550
+ print('ltx_en:', ret_data)
551
+ ret_data = anno_ins.forward('a boy', seed=2025)
552
+ print('ltx_en:', ret_data)
553
+
554
+ @unittest.skip('')
555
+ def test_annotator_prompt_extend_ds(self):
556
+ from vace.annotators.utils import get_annotator
557
+ # export DASH_API_KEY=''
558
+ anno_ins = get_annotator(config_type='prompt', config_task='wan_zh_ds', return_dict=False)
559
+ ret_data = anno_ins.forward('一位男孩', seed=2025)
560
+ print('wan_zh_ds:', ret_data)
561
+ ret_data = anno_ins.forward('a boy', seed=2025)
562
+ print('wan_zh_ds:', ret_data)
563
+
564
+
565
+ # ln -s your/path/annotator_models annotator_models
566
+ # PYTHONPATH=. python tests/test_annotators.py
567
+ if __name__ == '__main__':
568
+ unittest.main()
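
For a quick manual check outside the unittest harness, the same annotator API can be called directly. A minimal sketch based on the gray-image test above (the asset path comes from setUp; the output directory is an illustrative choice):

import os
import numpy as np
from PIL import Image
from vace.annotators.gray import GrayAnnotator

image = np.array(Image.open('./assets/images/test.jpg').convert('RGB'))
anno_ins = GrayAnnotator({})            # this annotator takes an empty config
anno_image = anno_ins.forward(image)    # uint8 array, saved the same way as in the tests

os.makedirs('./cache', exist_ok=True)
Image.fromarray(anno_image).save('./cache/gray_image.png')
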
vace/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from . import annotators
4
+ from . import configs
5
+ from . import models
6
+ from . import gradios
vace/annotators/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from .depth import DepthAnnotator, DepthVideoAnnotator, DepthV2VideoAnnotator
4
+ from .flow import FlowAnnotator, FlowVisAnnotator
5
+ from .frameref import FrameRefExtractAnnotator, FrameRefExpandAnnotator
6
+ from .gdino import GDINOAnnotator, GDINORAMAnnotator
7
+ from .gray import GrayAnnotator, GrayVideoAnnotator
8
+ from .inpainting import InpaintingAnnotator, InpaintingVideoAnnotator
9
+ from .layout import LayoutBboxAnnotator, LayoutMaskAnnotator, LayoutTrackAnnotator
10
+ from .maskaug import MaskAugAnnotator
11
+ from .outpainting import OutpaintingAnnotator, OutpaintingInnerAnnotator, OutpaintingVideoAnnotator, OutpaintingInnerVideoAnnotator
12
+ from .pose import PoseBodyFaceAnnotator, PoseBodyFaceVideoAnnotator, PoseAnnotator, PoseBodyVideoAnnotator, PoseBodyAnnotator
13
+ from .ram import RAMAnnotator
14
+ from .salient import SalientAnnotator, SalientVideoAnnotator
15
+ from .sam import SAMImageAnnotator
16
+ from .sam2 import SAM2ImageAnnotator, SAM2VideoAnnotator, SAM2SalientVideoAnnotator, SAM2GDINOVideoAnnotator
17
+ from .scribble import ScribbleAnnotator, ScribbleVideoAnnotator
18
+ from .face import FaceAnnotator
19
+ from .subject import SubjectAnnotator
20
+ from .common import PlainImageAnnotator, PlainMaskAnnotator, PlainMaskAugAnnotator, PlainMaskVideoAnnotator, PlainVideoAnnotator, PlainMaskAugVideoAnnotator, PlainMaskAugInvertAnnotator, PlainMaskAugInvertVideoAnnotator, ExpandMaskVideoAnnotator
21
+ from .prompt_extend import PromptExtendAnnotator
22
+ from .composition import CompositionAnnotator, ReferenceAnythingAnnotator, AnimateAnythingAnnotator, SwapAnythingAnnotator, ExpandAnythingAnnotator, MoveAnythingAnnotator
23
+ from .mask import MaskDrawAnnotator
24
+ from .canvas import RegionCanvasAnnotator
vace/annotators/canvas.py ADDED
@@ -0,0 +1,60 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import random
4
+
5
+ import cv2
6
+ import numpy as np
7
+
8
+ from .utils import convert_to_numpy
9
+
10
+
11
+ class RegionCanvasAnnotator:
12
+ def __init__(self, cfg, device=None):
13
+ self.scale_range = cfg.get('SCALE_RANGE', [0.75, 1.0])
14
+ self.canvas_value = cfg.get('CANVAS_VALUE', 255)
15
+ self.use_resize = cfg.get('USE_RESIZE', True)
16
+ self.use_canvas = cfg.get('USE_CANVAS', True)
17
+ self.use_aug = cfg.get('USE_AUG', False)
18
+ if self.use_aug:
19
+ from .maskaug import MaskAugAnnotator
20
+ self.maskaug_anno = MaskAugAnnotator(cfg={})
21
+
22
+ def forward(self, image, mask, mask_cfg=None):
23
+
24
+ image = convert_to_numpy(image)
25
+ mask = convert_to_numpy(mask)
26
+ image_h, image_w = image.shape[:2]
27
+
28
+ if self.use_aug:
29
+ mask = self.maskaug_anno.forward(mask, mask_cfg)
30
+
31
+ # get region with white bg
32
+ image[np.array(mask) == 0] = self.canvas_value
33
+ x, y, w, h = cv2.boundingRect(mask)
34
+ region_crop = image[y:y + h, x:x + w]
35
+
36
+ if self.use_resize:
37
+ # resize region
38
+ scale_min, scale_max = self.scale_range
39
+ scale_factor = random.uniform(scale_min, scale_max)
40
+ new_w, new_h = int(image_w * scale_factor), int(image_h * scale_factor)
41
+ obj_scale_factor = min(new_w/w, new_h/h)
42
+
43
+ new_w = int(w * obj_scale_factor)
44
+ new_h = int(h * obj_scale_factor)
45
+ region_crop_resized = cv2.resize(region_crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
46
+ else:
47
+ region_crop_resized = region_crop
48
+
49
+ if self.use_canvas:
50
+ # plot region into canvas
51
+ new_canvas = np.ones_like(image) * self.canvas_value
52
+ max_x = max(0, image_w - new_w)
53
+ max_y = max(0, image_h - new_h)
54
+ new_x = random.randint(0, max_x)
55
+ new_y = random.randint(0, max_y)
56
+
57
+ new_canvas[new_y:new_y + new_h, new_x:new_x + new_w] = region_crop_resized
58
+ else:
59
+ new_canvas = region_crop_resized
60
+ return new_canvas
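A minimal usage sketch for RegionCanvasAnnotator, assuming an HxWx3 uint8 image and a uint8 mask whose nonzero pixels mark the region to keep (the inputs below are synthetic placeholders):

    import numpy as np
    from vace.annotators.canvas import RegionCanvasAnnotator

    image = np.full((480, 640, 3), 128, dtype=np.uint8)
    mask = np.zeros((480, 640), dtype=np.uint8)
    mask[100:300, 200:400] = 255  # rectangular region of interest

    anno = RegionCanvasAnnotator(cfg={'SCALE_RANGE': [0.75, 1.0], 'CANVAS_VALUE': 255})
    canvas = anno.forward(image, mask)  # region rescaled and pasted at a random spot on a white canvas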
vace/annotators/common.py ADDED
@@ -0,0 +1,62 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+
4
+ class PlainImageAnnotator:
5
+ def __init__(self, cfg):
6
+ pass
7
+ def forward(self, image):
8
+ return image
9
+
10
+ class PlainVideoAnnotator:
11
+ def __init__(self, cfg):
12
+ pass
13
+ def forward(self, frames):
14
+ return frames
15
+
16
+ class PlainMaskAnnotator:
17
+ def __init__(self, cfg):
18
+ pass
19
+ def forward(self, mask):
20
+ return mask
21
+
22
+ class PlainMaskAugInvertAnnotator:
23
+ def __init__(self, cfg):
24
+ pass
25
+ def forward(self, mask):
26
+ return 255 - mask
27
+
28
+ class PlainMaskAugAnnotator:
29
+ def __init__(self, cfg):
30
+ pass
31
+ def forward(self, mask):
32
+ return mask
33
+
34
+ class PlainMaskVideoAnnotator:
35
+ def __init__(self, cfg):
36
+ pass
37
+ def forward(self, mask):
38
+ return mask
39
+
40
+ class PlainMaskAugVideoAnnotator:
41
+ def __init__(self, cfg):
42
+ pass
43
+ def forward(self, masks):
44
+ return masks
45
+
46
+ class PlainMaskAugInvertVideoAnnotator:
47
+ def __init__(self, cfg):
48
+ pass
49
+ def forward(self, masks):
50
+ return [255 - mask for mask in masks]
51
+
52
+ class ExpandMaskVideoAnnotator:
53
+ def __init__(self, cfg):
54
+ pass
55
+ def forward(self, mask, expand_num):
56
+ return [mask] * expand_num
57
+
58
+ class PlainPromptAnnotator:
59
+ def __init__(self, cfg):
60
+ pass
61
+ def forward(self, prompt):
62
+ return prompt
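These plain annotators are deliberate identity/no-op hooks for pipeline slots that need no real processing; the only ones that transform anything invert a mask. A trivial illustration with a uint8 mask:

    import numpy as np
    from vace.annotators.common import PlainMaskAugInvertAnnotator

    mask = np.zeros((8, 8), dtype=np.uint8)
    inverted = PlainMaskAugInvertAnnotator(cfg={}).forward(mask)  # 255 - mask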
vace/annotators/composition.py ADDED
@@ -0,0 +1,155 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import numpy as np
4
+
5
+ class CompositionAnnotator:
6
+ def __init__(self, cfg):
7
+ self.process_types = ["repaint", "extension", "control"]
8
+ self.process_map = {
9
+ "repaint": "repaint",
10
+ "extension": "extension",
11
+ "control": "control",
12
+ "inpainting": "repaint",
13
+ "outpainting": "repaint",
14
+ "frameref": "extension",
15
+ "clipref": "extension",
16
+ "depth": "control",
17
+ "flow": "control",
18
+ "gray": "control",
19
+ "pose": "control",
20
+ "scribble": "control",
21
+ "layout": "control"
22
+ }
23
+
24
+ def forward(self, process_type_1, process_type_2, frames_1, frames_2, masks_1, masks_2):
25
+ total_frames = min(len(frames_1), len(frames_2), len(masks_1), len(masks_2))
26
+ combine_type = (self.process_map[process_type_1], self.process_map[process_type_2])
27
+ if combine_type in [("extension", "repaint"), ("extension", "control"), ("extension", "extension")]:
28
+ output_video = [frames_2[i] * masks_1[i] + frames_1[i] * (1 - masks_1[i]) for i in range(total_frames)]
29
+ output_mask = [masks_1[i] * masks_2[i] * 255 for i in range(total_frames)]
30
+ elif combine_type in [("repaint", "extension"), ("control", "extension"), ("repaint", "repaint")]:
31
+ output_video = [frames_1[i] * (1 - masks_2[i]) + frames_2[i] * masks_2[i] for i in range(total_frames)]
32
+ output_mask = [(masks_1[i] * (1 - masks_2[i]) + masks_2[i] * masks_2[i]) * 255 for i in range(total_frames)]
33
+ elif combine_type in [("repaint", "control"), ("control", "repaint")]:
34
+ if combine_type in [("control", "repaint")]:
35
+ frames_1, frames_2, masks_1, masks_2 = frames_2, frames_1, masks_2, masks_1
36
+ output_video = [frames_1[i] * (1 - masks_1[i]) + frames_2[i] * masks_1[i] for i in range(total_frames)]
37
+ output_mask = [masks_1[i] * 255 for i in range(total_frames)]
38
+ elif combine_type in [("control", "control")]: # apply masks_2
39
+ output_video = [frames_1[i] * (1 - masks_2[i]) + frames_2[i] * masks_2[i] for i in range(total_frames)]
40
+ output_mask = [(masks_1[i] * (1 - masks_2[i]) + masks_2[i] * masks_2[i]) * 255 for i in range(total_frames)]
41
+ else:
42
+ raise Exception("Unknown combine type")
43
+ return output_video, output_mask
44
+
45
+
46
+ class ReferenceAnythingAnnotator:
47
+ def __init__(self, cfg):
48
+ from .subject import SubjectAnnotator
49
+ self.sbjref_ins = SubjectAnnotator(cfg['SUBJECT'] if 'SUBJECT' in cfg else cfg)
50
+ self.key_map = {
51
+ "image": "images",
52
+ "mask": "masks"
53
+ }
54
+ def forward(self, images, mode=None, return_mask=None, mask_cfg=None):
55
+ ret_data = {}
56
+ for image in images:
57
+ ret_one_data = self.sbjref_ins.forward(image=image, mode=mode, return_mask=return_mask, mask_cfg=mask_cfg)
58
+ if isinstance(ret_one_data, dict):
59
+ for key, val in ret_one_data.items():
60
+ if key in self.key_map:
61
+ new_key = self.key_map[key]
62
+ else:
63
+ continue
64
+ if new_key in ret_data:
65
+ ret_data[new_key].append(val)
66
+ else:
67
+ ret_data[new_key] = [val]
68
+ else:
69
+ if 'images' in ret_data:
70
+ ret_data['images'].append(ret_one_data)
71
+ else:
72
+ ret_data['images'] = [ret_one_data]
73
+ return ret_data
74
+
75
+
76
+ class AnimateAnythingAnnotator:
77
+ def __init__(self, cfg):
78
+ from .pose import PoseBodyFaceVideoAnnotator
79
+ self.pose_ins = PoseBodyFaceVideoAnnotator(cfg['POSE'])
80
+ self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
81
+
82
+ def forward(self, frames=None, images=None, mode=None, return_mask=None, mask_cfg=None):
83
+ ret_data = {}
84
+ ret_pose_data = self.pose_ins.forward(frames=frames)
85
+ ret_data.update({"frames": ret_pose_data})
86
+
87
+ ret_ref_data = self.ref_ins.forward(images=images, mode=mode, return_mask=return_mask, mask_cfg=mask_cfg)
88
+ ret_data.update({"images": ret_ref_data['images']})
89
+
90
+ return ret_data
91
+
92
+
93
+ class SwapAnythingAnnotator:
94
+ def __init__(self, cfg):
95
+ from .inpainting import InpaintingVideoAnnotator
96
+ self.inp_ins = InpaintingVideoAnnotator(cfg['INPAINTING'])
97
+ self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
98
+
99
+ def forward(self, video=None, frames=None, images=None, mode=None, mask=None, bbox=None, label=None, caption=None, return_mask=None, mask_cfg=None):
100
+ ret_data = {}
101
+ mode = mode.split(',') if ',' in mode else [mode, mode]
102
+
103
+ ret_inp_data = self.inp_ins.forward(video=video, frames=frames, mode=mode[0], mask=mask, bbox=bbox, label=label, caption=caption, mask_cfg=mask_cfg)
104
+ ret_data.update(ret_inp_data)
105
+
106
+ ret_ref_data = self.ref_ins.forward(images=images, mode=mode[1], return_mask=return_mask, mask_cfg=mask_cfg)
107
+ ret_data.update({"images": ret_ref_data['images']})
108
+
109
+ return ret_data
110
+
111
+
112
+ class ExpandAnythingAnnotator:
113
+ def __init__(self, cfg):
114
+ from .outpainting import OutpaintingAnnotator
115
+ from .frameref import FrameRefExpandAnnotator
116
+ self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
117
+ self.frameref_ins = FrameRefExpandAnnotator(cfg['FRAMEREF'])
118
+ self.outpainting_ins = OutpaintingAnnotator(cfg['OUTPAINTING'])
119
+
120
+ def forward(self, images=None, mode=None, return_mask=None, mask_cfg=None, direction=None, expand_ratio=None, expand_num=None):
121
+ ret_data = {}
122
+ expand_image, reference_image = images[0], images[1:]
123
+ mode = mode.split(',') if ',' in mode else ['firstframe', mode]
124
+
125
+ outpainting_data = self.outpainting_ins.forward(expand_image, expand_ratio=expand_ratio, direction=direction)
126
+ outpainting_image, outpainting_mask = outpainting_data['image'], outpainting_data['mask']
127
+
128
+ frameref_data = self.frameref_ins.forward(outpainting_image, mode=mode[0], expand_num=expand_num)
129
+ frames, masks = frameref_data['frames'], frameref_data['masks']
130
+ masks[0] = outpainting_mask
131
+ ret_data.update({"frames": frames, "masks": masks})
132
+
133
+ ret_ref_data = self.ref_ins.forward(images=reference_image, mode=mode[1], return_mask=return_mask, mask_cfg=mask_cfg)
134
+ ret_data.update({"images": ret_ref_data['images']})
135
+
136
+ return ret_data
137
+
138
+
139
+ class MoveAnythingAnnotator:
140
+ def __init__(self, cfg):
141
+ from .layout import LayoutBboxAnnotator
142
+ self.layout_bbox_ins = LayoutBboxAnnotator(cfg['LAYOUTBBOX'])
143
+
144
+ def forward(self, image=None, bbox=None, label=None, expand_num=None):
145
+ frame_size = image.shape[:2] # [H, W]
146
+ ret_layout_data = self.layout_bbox_ins.forward(bbox, frame_size=frame_size, num_frames=expand_num, label=label)
147
+
148
+ out_frames = [image] + ret_layout_data
149
+ out_mask = [np.zeros(frame_size, dtype=np.uint8)] + [np.ones(frame_size, dtype=np.uint8) * 255] * len(ret_layout_data)
150
+
151
+ ret_data = {
152
+ "frames": out_frames,
153
+ "masks": out_mask
154
+ }
155
+ return ret_data
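A minimal sketch of combining two preprocessed clips with CompositionAnnotator; the frame/mask shapes and the {0, 1} float masks are assumptions consistent with the arithmetic above, not values taken from this repo:

    import numpy as np
    from vace.annotators.composition import CompositionAnnotator

    comp = CompositionAnnotator(cfg={})
    frames_a = [np.zeros((240, 320, 3), dtype=np.float32) for _ in range(4)]
    frames_b = [np.ones((240, 320, 3), dtype=np.float32) for _ in range(4)]
    masks_a = [np.ones((240, 320, 1), dtype=np.float32) for _ in range(4)]
    masks_b = [np.zeros((240, 320, 1), dtype=np.float32) for _ in range(4)]
    video, mask = comp.forward('inpainting', 'depth', frames_a, frames_b, masks_a, masks_b)
    # 'inpainting' maps to "repaint" and 'depth' to "control", so frames_b is pasted where masks_a == 1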
vace/annotators/depth.py ADDED
@@ -0,0 +1,88 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import numpy as np
4
+ import torch
5
+ from einops import rearrange
6
+
7
+ from .utils import convert_to_numpy, resize_image, resize_image_ori
8
+
9
+ class DepthAnnotator:
10
+ def __init__(self, cfg, device=None):
11
+ from .midas.api import MiDaSInference
12
+ pretrained_model = cfg['PRETRAINED_MODEL']
13
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
14
+ self.model = MiDaSInference(model_type='dpt_hybrid', model_path=pretrained_model).to(self.device)
15
+ self.a = cfg.get('A', np.pi * 2.0)
16
+ self.bg_th = cfg.get('BG_TH', 0.1)
17
+
18
+ @torch.no_grad()
19
+ @torch.inference_mode()
20
+ @torch.autocast('cuda', enabled=False)
21
+ def forward(self, image):
22
+ image = convert_to_numpy(image)
23
+ image_depth = image
24
+ h, w, c = image.shape
25
+ image_depth, k = resize_image(image_depth,
26
+ 1024 if min(h, w) > 1024 else min(h, w))
27
+ image_depth = torch.from_numpy(image_depth).float().to(self.device)
28
+ image_depth = image_depth / 127.5 - 1.0
29
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
30
+ depth = self.model(image_depth)[0]
31
+
32
+ depth_pt = depth.clone()
33
+ depth_pt -= torch.min(depth_pt)
34
+ depth_pt /= torch.max(depth_pt)
35
+ depth_pt = depth_pt.cpu().numpy()
36
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
37
+ depth_image = depth_image[..., None].repeat(3, 2)
38
+
39
+ depth_image = resize_image_ori(h, w, depth_image, k)
40
+ return depth_image
41
+
42
+
43
+ class DepthVideoAnnotator(DepthAnnotator):
44
+ def forward(self, frames):
45
+ ret_frames = []
46
+ for frame in frames:
47
+ anno_frame = super().forward(np.array(frame))
48
+ ret_frames.append(anno_frame)
49
+ return ret_frames
50
+
51
+
52
+ class DepthV2Annotator:
53
+ def __init__(self, cfg, device=None):
54
+ from .depth_anything_v2.dpt import DepthAnythingV2
55
+ pretrained_model = cfg['PRETRAINED_MODEL']
56
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
57
+ self.model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(self.device)
58
+ self.model.load_state_dict(
59
+ torch.load(
60
+ pretrained_model,
61
+ map_location=self.device
62
+ )
63
+ )
64
+ self.model.eval()
65
+
66
+ @torch.inference_mode()
67
+ @torch.autocast('cuda', enabled=False)
68
+ def forward(self, image):
69
+ image = convert_to_numpy(image)
70
+ depth = self.model.infer_image(image)
71
+
72
+ depth_pt = depth.copy()
73
+ depth_pt -= np.min(depth_pt)
74
+ depth_pt /= np.max(depth_pt)
75
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
76
+
77
+ depth_image = depth_image[..., np.newaxis]
78
+ depth_image = np.repeat(depth_image, 3, axis=2)
79
+ return depth_image
80
+
81
+
82
+ class DepthV2VideoAnnotator(DepthV2Annotator):
83
+ def forward(self, frames):
84
+ ret_frames = []
85
+ for frame in frames:
86
+ anno_frame = super().forward(np.array(frame))
87
+ ret_frames.append(anno_frame)
88
+ return ret_frames
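A minimal sketch of driving the Depth-Anything-V2 video annotator above; the checkpoint path and the placeholder frames are illustrative assumptions:

    import numpy as np
    from vace.annotators.depth import DepthV2VideoAnnotator

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(4)]  # placeholder frames
    anno = DepthV2VideoAnnotator(
        cfg={'PRETRAINED_MODEL': 'annotator_models/depth/depth_anything_v2_vitl.pth'})  # placeholder path
    depth_frames = anno.forward(frames)  # one HxWx3 uint8 depth visualization per input frame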
vace/annotators/depth_anything_v2/__init__.py ADDED
File without changes
vace/annotators/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,414 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+ logger = logging.getLogger("dinov2")
23
+
24
+
25
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
26
+ if not depth_first and include_root:
27
+ fn(module=module, name=name)
28
+ for child_name, child_module in module.named_children():
29
+ child_name = ".".join((name, child_name)) if name else child_name
30
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
31
+ if depth_first and include_root:
32
+ fn(module=module, name=name)
33
+ return module
34
+
35
+
36
+ class BlockChunk(nn.ModuleList):
37
+ def forward(self, x):
38
+ for b in self:
39
+ x = b(x)
40
+ return x
41
+
42
+
43
+ class DinoVisionTransformer(nn.Module):
44
+ def __init__(
45
+ self,
46
+ img_size=224,
47
+ patch_size=16,
48
+ in_chans=3,
49
+ embed_dim=768,
50
+ depth=12,
51
+ num_heads=12,
52
+ mlp_ratio=4.0,
53
+ qkv_bias=True,
54
+ ffn_bias=True,
55
+ proj_bias=True,
56
+ drop_path_rate=0.0,
57
+ drop_path_uniform=False,
58
+ init_values=None, # for layerscale: None or 0 => no layerscale
59
+ embed_layer=PatchEmbed,
60
+ act_layer=nn.GELU,
61
+ block_fn=Block,
62
+ ffn_layer="mlp",
63
+ block_chunks=1,
64
+ num_register_tokens=0,
65
+ interpolate_antialias=False,
66
+ interpolate_offset=0.1,
67
+ ):
68
+ """
69
+ Args:
70
+ img_size (int, tuple): input image size
71
+ patch_size (int, tuple): patch size
72
+ in_chans (int): number of input channels
73
+ embed_dim (int): embedding dimension
74
+ depth (int): depth of transformer
75
+ num_heads (int): number of attention heads
76
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
77
+ qkv_bias (bool): enable bias for qkv if True
78
+ proj_bias (bool): enable bias for proj in attn if True
79
+ ffn_bias (bool): enable bias for ffn if True
80
+ drop_path_rate (float): stochastic depth rate
81
+ drop_path_uniform (bool): apply uniform drop rate across blocks
82
+ weight_init (str): weight init scheme
83
+ init_values (float): layer-scale init values
84
+ embed_layer (nn.Module): patch embedding layer
85
+ act_layer (nn.Module): MLP activation layer
86
+ block_fn (nn.Module): transformer block class
87
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
88
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
89
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
90
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
91
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
92
+ """
93
+ super().__init__()
94
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
95
+
96
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
97
+ self.num_tokens = 1
98
+ self.n_blocks = depth
99
+ self.num_heads = num_heads
100
+ self.patch_size = patch_size
101
+ self.num_register_tokens = num_register_tokens
102
+ self.interpolate_antialias = interpolate_antialias
103
+ self.interpolate_offset = interpolate_offset
104
+
105
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
106
+ num_patches = self.patch_embed.num_patches
107
+
108
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
109
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
110
+ assert num_register_tokens >= 0
111
+ self.register_tokens = (
112
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
113
+ )
114
+
115
+ if drop_path_uniform is True:
116
+ dpr = [drop_path_rate] * depth
117
+ else:
118
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
119
+
120
+ if ffn_layer == "mlp":
121
+ logger.info("using MLP layer as FFN")
122
+ ffn_layer = Mlp
123
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
124
+ logger.info("using SwiGLU layer as FFN")
125
+ ffn_layer = SwiGLUFFNFused
126
+ elif ffn_layer == "identity":
127
+ logger.info("using Identity layer as FFN")
128
+
129
+ def f(*args, **kwargs):
130
+ return nn.Identity()
131
+
132
+ ffn_layer = f
133
+ else:
134
+ raise NotImplementedError
135
+
136
+ blocks_list = [
137
+ block_fn(
138
+ dim=embed_dim,
139
+ num_heads=num_heads,
140
+ mlp_ratio=mlp_ratio,
141
+ qkv_bias=qkv_bias,
142
+ proj_bias=proj_bias,
143
+ ffn_bias=ffn_bias,
144
+ drop_path=dpr[i],
145
+ norm_layer=norm_layer,
146
+ act_layer=act_layer,
147
+ ffn_layer=ffn_layer,
148
+ init_values=init_values,
149
+ )
150
+ for i in range(depth)
151
+ ]
152
+ if block_chunks > 0:
153
+ self.chunked_blocks = True
154
+ chunked_blocks = []
155
+ chunksize = depth // block_chunks
156
+ for i in range(0, depth, chunksize):
157
+ # this is to keep the block index consistent if we chunk the block list
158
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i: i + chunksize])
159
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
160
+ else:
161
+ self.chunked_blocks = False
162
+ self.blocks = nn.ModuleList(blocks_list)
163
+
164
+ self.norm = norm_layer(embed_dim)
165
+ self.head = nn.Identity()
166
+
167
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
168
+
169
+ self.init_weights()
170
+
171
+ def init_weights(self):
172
+ trunc_normal_(self.pos_embed, std=0.02)
173
+ nn.init.normal_(self.cls_token, std=1e-6)
174
+ if self.register_tokens is not None:
175
+ nn.init.normal_(self.register_tokens, std=1e-6)
176
+ named_apply(init_weights_vit_timm, self)
177
+
178
+ def interpolate_pos_encoding(self, x, w, h):
179
+ previous_dtype = x.dtype
180
+ npatch = x.shape[1] - 1
181
+ N = self.pos_embed.shape[1] - 1
182
+ if npatch == N and w == h:
183
+ return self.pos_embed
184
+ pos_embed = self.pos_embed.float()
185
+ class_pos_embed = pos_embed[:, 0]
186
+ patch_pos_embed = pos_embed[:, 1:]
187
+ dim = x.shape[-1]
188
+ w0 = w // self.patch_size
189
+ h0 = h // self.patch_size
190
+ # we add a small number to avoid floating point error in the interpolation
191
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
192
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
193
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
194
+ # w0, h0 = w0 + 0.1, h0 + 0.1
195
+
196
+ sqrt_N = math.sqrt(N)
197
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
198
+ patch_pos_embed = nn.functional.interpolate(
199
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
200
+ scale_factor=(sx, sy),
201
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
202
+ mode="bicubic",
203
+ antialias=self.interpolate_antialias
204
+ )
205
+
206
+ assert int(w0) == patch_pos_embed.shape[-2]
207
+ assert int(h0) == patch_pos_embed.shape[-1]
208
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
209
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
210
+
211
+ def prepare_tokens_with_masks(self, x, masks=None):
212
+ B, nc, w, h = x.shape
213
+ x = self.patch_embed(x)
214
+ if masks is not None:
215
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
216
+
217
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
218
+ x = x + self.interpolate_pos_encoding(x, w, h)
219
+
220
+ if self.register_tokens is not None:
221
+ x = torch.cat(
222
+ (
223
+ x[:, :1],
224
+ self.register_tokens.expand(x.shape[0], -1, -1),
225
+ x[:, 1:],
226
+ ),
227
+ dim=1,
228
+ )
229
+
230
+ return x
231
+
232
+ def forward_features_list(self, x_list, masks_list):
233
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
234
+ for blk in self.blocks:
235
+ x = blk(x)
236
+
237
+ all_x = x
238
+ output = []
239
+ for x, masks in zip(all_x, masks_list):
240
+ x_norm = self.norm(x)
241
+ output.append(
242
+ {
243
+ "x_norm_clstoken": x_norm[:, 0],
244
+ "x_norm_regtokens": x_norm[:, 1: self.num_register_tokens + 1],
245
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1:],
246
+ "x_prenorm": x,
247
+ "masks": masks,
248
+ }
249
+ )
250
+ return output
251
+
252
+ def forward_features(self, x, masks=None):
253
+ if isinstance(x, list):
254
+ return self.forward_features_list(x, masks)
255
+
256
+ x = self.prepare_tokens_with_masks(x, masks)
257
+
258
+ for blk in self.blocks:
259
+ x = blk(x)
260
+
261
+ x_norm = self.norm(x)
262
+ return {
263
+ "x_norm_clstoken": x_norm[:, 0],
264
+ "x_norm_regtokens": x_norm[:, 1: self.num_register_tokens + 1],
265
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1:],
266
+ "x_prenorm": x,
267
+ "masks": masks,
268
+ }
269
+
270
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
271
+ x = self.prepare_tokens_with_masks(x)
272
+ # If n is an int, take the n last blocks. If it's a list, take them
273
+ output, total_block_len = [], len(self.blocks)
274
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
275
+ for i, blk in enumerate(self.blocks):
276
+ x = blk(x)
277
+ if i in blocks_to_take:
278
+ output.append(x)
279
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
280
+ return output
281
+
282
+ def _get_intermediate_layers_chunked(self, x, n=1):
283
+ x = self.prepare_tokens_with_masks(x)
284
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
285
+ # If n is an int, take the n last blocks. If it's a list, take them
286
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
287
+ for block_chunk in self.blocks:
288
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
289
+ x = blk(x)
290
+ if i in blocks_to_take:
291
+ output.append(x)
292
+ i += 1
293
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
294
+ return output
295
+
296
+ def get_intermediate_layers(
297
+ self,
298
+ x: torch.Tensor,
299
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
300
+ reshape: bool = False,
301
+ return_class_token: bool = False,
302
+ norm=True
303
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
304
+ if self.chunked_blocks:
305
+ outputs = self._get_intermediate_layers_chunked(x, n)
306
+ else:
307
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
308
+ if norm:
309
+ outputs = [self.norm(out) for out in outputs]
310
+ class_tokens = [out[:, 0] for out in outputs]
311
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
312
+ if reshape:
313
+ B, _, w, h = x.shape
314
+ outputs = [
315
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
316
+ for out in outputs
317
+ ]
318
+ if return_class_token:
319
+ return tuple(zip(outputs, class_tokens))
320
+ return tuple(outputs)
321
+
322
+ def forward(self, *args, is_training=False, **kwargs):
323
+ ret = self.forward_features(*args, **kwargs)
324
+ if is_training:
325
+ return ret
326
+ else:
327
+ return self.head(ret["x_norm_clstoken"])
328
+
329
+
330
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
331
+ """ViT weight initialization, original timm impl (for reproducibility)"""
332
+ if isinstance(module, nn.Linear):
333
+ trunc_normal_(module.weight, std=0.02)
334
+ if module.bias is not None:
335
+ nn.init.zeros_(module.bias)
336
+
337
+
338
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
339
+ model = DinoVisionTransformer(
340
+ patch_size=patch_size,
341
+ embed_dim=384,
342
+ depth=12,
343
+ num_heads=6,
344
+ mlp_ratio=4,
345
+ block_fn=partial(Block, attn_class=MemEffAttention),
346
+ num_register_tokens=num_register_tokens,
347
+ **kwargs,
348
+ )
349
+ return model
350
+
351
+
352
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
353
+ model = DinoVisionTransformer(
354
+ patch_size=patch_size,
355
+ embed_dim=768,
356
+ depth=12,
357
+ num_heads=12,
358
+ mlp_ratio=4,
359
+ block_fn=partial(Block, attn_class=MemEffAttention),
360
+ num_register_tokens=num_register_tokens,
361
+ **kwargs,
362
+ )
363
+ return model
364
+
365
+
366
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
367
+ model = DinoVisionTransformer(
368
+ patch_size=patch_size,
369
+ embed_dim=1024,
370
+ depth=24,
371
+ num_heads=16,
372
+ mlp_ratio=4,
373
+ block_fn=partial(Block, attn_class=MemEffAttention),
374
+ num_register_tokens=num_register_tokens,
375
+ **kwargs,
376
+ )
377
+ return model
378
+
379
+
380
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
381
+ """
382
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
383
+ """
384
+ model = DinoVisionTransformer(
385
+ patch_size=patch_size,
386
+ embed_dim=1536,
387
+ depth=40,
388
+ num_heads=24,
389
+ mlp_ratio=4,
390
+ block_fn=partial(Block, attn_class=MemEffAttention),
391
+ num_register_tokens=num_register_tokens,
392
+ **kwargs,
393
+ )
394
+ return model
395
+
396
+
397
+ def DINOv2(model_name):
398
+ model_zoo = {
399
+ "vits": vit_small,
400
+ "vitb": vit_base,
401
+ "vitl": vit_large,
402
+ "vitg": vit_giant2
403
+ }
404
+
405
+ return model_zoo[model_name](
406
+ img_size=518,
407
+ patch_size=14,
408
+ init_values=1.0,
409
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
410
+ block_chunks=0,
411
+ num_register_tokens=0,
412
+ interpolate_antialias=False,
413
+ interpolate_offset=0.1
414
+ )
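The DINOv2 factory at the end builds a ViT on a 518x518 / patch-14 grid; a small sketch of pulling the intermediate features that the DPT head consumes (random weights, CPU, purely illustrative):

    import torch
    from vace.annotators.depth_anything_v2.dinov2 import DINOv2

    backbone = DINOv2('vitl')                       # ViT-L/14, randomly initialized here
    x = torch.randn(1, 3, 518, 518)                 # H and W must be multiples of 14
    feats = backbone.get_intermediate_layers(x, n=[4, 11, 17, 23], return_class_token=True)
    # feats: tuple of (patch_tokens, cls_token) pairs, one per requested block index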
vace/annotators/depth_anything_v2/dpt.py ADDED
@@ -0,0 +1,210 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import cv2
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torchvision.transforms import Compose
8
+
9
+ from .dinov2 import DINOv2
10
+ from .util.blocks import FeatureFusionBlock, _make_scratch
11
+ from .util.transform import Resize, NormalizeImage, PrepareForNet
12
+
13
+
14
+ class DepthAnythingV2(nn.Module):
15
+ def __init__(
16
+ self,
17
+ encoder='vitl',
18
+ features=256,
19
+ out_channels=[256, 512, 1024, 1024],
20
+ use_bn=False,
21
+ use_clstoken=False
22
+ ):
23
+ super(DepthAnythingV2, self).__init__()
24
+
25
+ self.intermediate_layer_idx = {
26
+ 'vits': [2, 5, 8, 11],
27
+ 'vitb': [2, 5, 8, 11],
28
+ 'vitl': [4, 11, 17, 23],
29
+ 'vitg': [9, 19, 29, 39]
30
+ }
31
+
32
+ self.encoder = encoder
33
+ self.pretrained = DINOv2(model_name=encoder)
34
+
35
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels,
36
+ use_clstoken=use_clstoken)
37
+
38
+ def forward(self, x):
39
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
40
+
41
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder],
42
+ return_class_token=True)
43
+
44
+ depth = self.depth_head(features, patch_h, patch_w)
45
+ depth = F.relu(depth)
46
+
47
+ return depth.squeeze(1)
48
+
49
+ @torch.no_grad()
50
+ def infer_image(self, raw_image, input_size=518):
51
+ image, (h, w) = self.image2tensor(raw_image, input_size)
52
+
53
+ depth = self.forward(image)
54
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
55
+
56
+ return depth.cpu().numpy()
57
+
58
+ def image2tensor(self, raw_image, input_size=518):
59
+ transform = Compose([
60
+ Resize(
61
+ width=input_size,
62
+ height=input_size,
63
+ resize_target=False,
64
+ keep_aspect_ratio=True,
65
+ ensure_multiple_of=14,
66
+ resize_method='lower_bound',
67
+ image_interpolation_method=cv2.INTER_CUBIC,
68
+ ),
69
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
70
+ PrepareForNet(),
71
+ ])
72
+
73
+ h, w = raw_image.shape[:2]
74
+
75
+ image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
76
+
77
+ image = transform({'image': image})['image']
78
+ image = torch.from_numpy(image).unsqueeze(0)
79
+
80
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
81
+ image = image.to(DEVICE)
82
+
83
+ return image, (h, w)
84
+
85
+
86
+ class DPTHead(nn.Module):
87
+ def __init__(
88
+ self,
89
+ in_channels,
90
+ features=256,
91
+ use_bn=False,
92
+ out_channels=[256, 512, 1024, 1024],
93
+ use_clstoken=False
94
+ ):
95
+ super(DPTHead, self).__init__()
96
+
97
+ self.use_clstoken = use_clstoken
98
+
99
+ self.projects = nn.ModuleList([
100
+ nn.Conv2d(
101
+ in_channels=in_channels,
102
+ out_channels=out_channel,
103
+ kernel_size=1,
104
+ stride=1,
105
+ padding=0,
106
+ ) for out_channel in out_channels
107
+ ])
108
+
109
+ self.resize_layers = nn.ModuleList([
110
+ nn.ConvTranspose2d(
111
+ in_channels=out_channels[0],
112
+ out_channels=out_channels[0],
113
+ kernel_size=4,
114
+ stride=4,
115
+ padding=0),
116
+ nn.ConvTranspose2d(
117
+ in_channels=out_channels[1],
118
+ out_channels=out_channels[1],
119
+ kernel_size=2,
120
+ stride=2,
121
+ padding=0),
122
+ nn.Identity(),
123
+ nn.Conv2d(
124
+ in_channels=out_channels[3],
125
+ out_channels=out_channels[3],
126
+ kernel_size=3,
127
+ stride=2,
128
+ padding=1)
129
+ ])
130
+
131
+ if use_clstoken:
132
+ self.readout_projects = nn.ModuleList()
133
+ for _ in range(len(self.projects)):
134
+ self.readout_projects.append(
135
+ nn.Sequential(
136
+ nn.Linear(2 * in_channels, in_channels),
137
+ nn.GELU()))
138
+
139
+ self.scratch = _make_scratch(
140
+ out_channels,
141
+ features,
142
+ groups=1,
143
+ expand=False,
144
+ )
145
+
146
+ self.scratch.stem_transpose = None
147
+
148
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
149
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
150
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
151
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
152
+
153
+ head_features_1 = features
154
+ head_features_2 = 32
155
+
156
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
157
+ self.scratch.output_conv2 = nn.Sequential(
158
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
159
+ nn.ReLU(True),
160
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
161
+ nn.ReLU(True),
162
+ nn.Identity(),
163
+ )
164
+
165
+ def forward(self, out_features, patch_h, patch_w):
166
+ out = []
167
+ for i, x in enumerate(out_features):
168
+ if self.use_clstoken:
169
+ x, cls_token = x[0], x[1]
170
+ readout = cls_token.unsqueeze(1).expand_as(x)
171
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
172
+ else:
173
+ x = x[0]
174
+
175
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
176
+
177
+ x = self.projects[i](x)
178
+ x = self.resize_layers[i](x)
179
+
180
+ out.append(x)
181
+
182
+ layer_1, layer_2, layer_3, layer_4 = out
183
+
184
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
185
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
186
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
187
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
188
+
189
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
190
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
191
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
192
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
193
+
194
+ out = self.scratch.output_conv1(path_1)
195
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
196
+ out = self.scratch.output_conv2(out)
197
+
198
+ return out
199
+
200
+
201
+ def _make_fusion_block(features, use_bn, size=None):
202
+ return FeatureFusionBlock(
203
+ features,
204
+ nn.ReLU(False),
205
+ deconv=False,
206
+ bn=use_bn,
207
+ expand=False,
208
+ align_corners=True,
209
+ size=size,
210
+ )
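For completeness, the DPT model above can also be used directly without the annotator wrapper; a sketch under the assumption of a local ViT-L checkpoint (placeholder path), with the model moved to the same device that image2tensor() selects internally:

    import cv2
    import torch
    from vace.annotators.depth_anything_v2.dpt import DepthAnythingV2

    model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024])
    model.load_state_dict(torch.load('annotator_models/depth/depth_anything_v2_vitl.pth',
                                     map_location='cpu'))  # placeholder path
    device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
    model = model.to(device).eval()             # image2tensor() picks the same device order
    raw = cv2.imread('assets/images/test.jpg')  # BGR uint8, as image2tensor() expects
    depth = model.infer_image(raw, input_size=518)  # HxW float32 relative depth map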
vace/annotators/depth_anything_v2/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
vace/annotators/depth_anything_v2/layers/attention.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+ logger = logging.getLogger("dinov2")
17
+
18
+ try:
19
+ from xformers.ops import memory_efficient_attention, unbind, fmha
20
+
21
+ XFORMERS_AVAILABLE = True
22
+ except ImportError:
23
+ logger.warning("xFormers not available")
24
+ XFORMERS_AVAILABLE = False
25
+
26
+
27
+ class Attention(nn.Module):
28
+ def __init__(
29
+ self,
30
+ dim: int,
31
+ num_heads: int = 8,
32
+ qkv_bias: bool = False,
33
+ proj_bias: bool = True,
34
+ attn_drop: float = 0.0,
35
+ proj_drop: float = 0.0,
36
+ ) -> None:
37
+ super().__init__()
38
+ self.num_heads = num_heads
39
+ head_dim = dim // num_heads
40
+ self.scale = head_dim ** -0.5
41
+
42
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
43
+ self.attn_drop = nn.Dropout(attn_drop)
44
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
45
+ self.proj_drop = nn.Dropout(proj_drop)
46
+
47
+ def forward(self, x: Tensor) -> Tensor:
48
+ B, N, C = x.shape
49
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
50
+
51
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
52
+ attn = q @ k.transpose(-2, -1)
53
+
54
+ attn = attn.softmax(dim=-1)
55
+ attn = self.attn_drop(attn)
56
+
57
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
58
+ x = self.proj(x)
59
+ x = self.proj_drop(x)
60
+ return x
61
+
62
+
63
+ class MemEffAttention(Attention):
64
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
65
+ if not XFORMERS_AVAILABLE:
66
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
67
+ return super().forward(x)
68
+
69
+ B, N, C = x.shape
70
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
71
+
72
+ q, k, v = unbind(qkv, 2)
73
+
74
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
75
+ x = x.reshape([B, N, C])
76
+
77
+ x = self.proj(x)
78
+ x = self.proj_drop(x)
79
+ return x
vace/annotators/depth_anything_v2/layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ # logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path2(ffn_residual_func(x))
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
vace/annotators/depth_anything_v2/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+ from torch import nn
12
+
13
+
14
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
15
+ if drop_prob == 0.0 or not training:
16
+ return x
17
+ keep_prob = 1 - drop_prob
18
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20
+ if keep_prob > 0.0:
21
+ random_tensor.div_(keep_prob)
22
+ output = x * random_tensor
23
+ return output
24
+
25
+
26
+ class DropPath(nn.Module):
27
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
28
+
29
+ def __init__(self, drop_prob=None):
30
+ super(DropPath, self).__init__()
31
+ self.drop_prob = drop_prob
32
+
33
+ def forward(self, x):
34
+ return drop_path(x, self.drop_prob, self.training)
vace/annotators/depth_anything_v2/layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
7
+
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
vace/annotators/depth_anything_v2/layers/mlp.py ADDED
@@ -0,0 +1,39 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+ from typing import Callable, Optional
12
+ from torch import Tensor, nn
13
+
14
+
15
+ class Mlp(nn.Module):
16
+ def __init__(
17
+ self,
18
+ in_features: int,
19
+ hidden_features: Optional[int] = None,
20
+ out_features: Optional[int] = None,
21
+ act_layer: Callable[..., nn.Module] = nn.GELU,
22
+ drop: float = 0.0,
23
+ bias: bool = True,
24
+ ) -> None:
25
+ super().__init__()
26
+ out_features = out_features or in_features
27
+ hidden_features = hidden_features or in_features
28
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
29
+ self.act = act_layer()
30
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
31
+ self.drop = nn.Dropout(drop)
32
+
33
+ def forward(self, x: Tensor) -> Tensor:
34
+ x = self.fc1(x)
35
+ x = self.act(x)
36
+ x = self.drop(x)
37
+ x = self.fc2(x)
38
+ x = self.drop(x)
39
+ return x
vace/annotators/depth_anything_v2/layers/patch_embed.py ADDED
@@ -0,0 +1,90 @@
1
+
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # References:
9
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
10
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
11
+
12
+ from typing import Callable, Optional, Tuple, Union
13
+
14
+ from torch import Tensor
15
+ import torch.nn as nn
16
+
17
+
18
+ def make_2tuple(x):
19
+ if isinstance(x, tuple):
20
+ assert len(x) == 2
21
+ return x
22
+
23
+ assert isinstance(x, int)
24
+ return (x, x)
25
+
26
+
27
+ class PatchEmbed(nn.Module):
28
+ """
29
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
30
+
31
+ Args:
32
+ img_size: Image size.
33
+ patch_size: Patch token size.
34
+ in_chans: Number of input image channels.
35
+ embed_dim: Number of linear projection output channels.
36
+ norm_layer: Normalization layer.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ img_size: Union[int, Tuple[int, int]] = 224,
42
+ patch_size: Union[int, Tuple[int, int]] = 16,
43
+ in_chans: int = 3,
44
+ embed_dim: int = 768,
45
+ norm_layer: Optional[Callable] = None,
46
+ flatten_embedding: bool = True,
47
+ ) -> None:
48
+ super().__init__()
49
+
50
+ image_HW = make_2tuple(img_size)
51
+ patch_HW = make_2tuple(patch_size)
52
+ patch_grid_size = (
53
+ image_HW[0] // patch_HW[0],
54
+ image_HW[1] // patch_HW[1],
55
+ )
56
+
57
+ self.img_size = image_HW
58
+ self.patch_size = patch_HW
59
+ self.patches_resolution = patch_grid_size
60
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
61
+
62
+ self.in_chans = in_chans
63
+ self.embed_dim = embed_dim
64
+
65
+ self.flatten_embedding = flatten_embedding
66
+
67
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
68
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
69
+
70
+ def forward(self, x: Tensor) -> Tensor:
71
+ _, _, H, W = x.shape
72
+ patch_H, patch_W = self.patch_size
73
+
74
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
75
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
76
+
77
+ x = self.proj(x) # B C H W
78
+ H, W = x.size(2), x.size(3)
79
+ x = x.flatten(2).transpose(1, 2) # B HW C
80
+ x = self.norm(x)
81
+ if not self.flatten_embedding:
82
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
83
+ return x
84
+
85
+ def flops(self) -> float:
86
+ Ho, Wo = self.patches_resolution
87
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
88
+ if self.norm is not None:
89
+ flops += Ho * Wo * self.embed_dim
90
+ return flops
vace/annotators/depth_anything_v2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,64 @@
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from typing import Callable, Optional
+
+ from torch import Tensor, nn
+ import torch.nn.functional as F
+
+
+ class SwiGLUFFN(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = None,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         super().__init__()
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+         self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+     def forward(self, x: Tensor) -> Tensor:
+         x12 = self.w12(x)
+         x1, x2 = x12.chunk(2, dim=-1)
+         hidden = F.silu(x1) * x2
+         return self.w3(hidden)
+
+
+ try:
+     from xformers.ops import SwiGLU
+
+     XFORMERS_AVAILABLE = True
+ except ImportError:
+     SwiGLU = SwiGLUFFN
+     XFORMERS_AVAILABLE = False
+
+
+ class SwiGLUFFNFused(SwiGLU):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = None,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+         super().__init__(
+             in_features=in_features,
+             hidden_features=hidden_features,
+             out_features=out_features,
+             bias=bias,
+         )
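
A minimal sketch of the two feed-forward variants above (import path assumed). `SwiGLUFFNFused` shrinks the requested hidden width to roughly two thirds and rounds it up to a multiple of 8, and it transparently falls back to the pure-PyTorch `SwiGLUFFN` when xformers is not installed:

import torch
from vace.annotators.depth_anything_v2.layers.swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused  # assumed path

x = torch.randn(2, 196, 768)
ffn = SwiGLUFFN(in_features=768, hidden_features=3072)
print(ffn(x).shape)     # torch.Size([2, 196, 768])

# Fused variant: int(3072 * 2 / 3) = 2048, already a multiple of 8, so the hidden width becomes 2048.
fused = SwiGLUFFNFused(in_features=768, hidden_features=3072)
print(fused(x).shape)   # torch.Size([2, 196, 768])
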
vace/annotators/depth_anything_v2/util/__init__.py ADDED
File without changes
vace/annotators/depth_anything_v2/util/blocks.py ADDED
@@ -0,0 +1,151 @@
+ import torch.nn as nn
+
+
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+     scratch = nn.Module()
+
+     out_shape1 = out_shape
+     out_shape2 = out_shape
+     out_shape3 = out_shape
+     if len(in_shape) >= 4:
+         out_shape4 = out_shape
+
+     if expand:
+         out_shape1 = out_shape
+         out_shape2 = out_shape * 2
+         out_shape3 = out_shape * 4
+         if len(in_shape) >= 4:
+             out_shape4 = out_shape * 8
+
+     scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False,
+                                   groups=groups)
+     scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False,
+                                   groups=groups)
+     scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False,
+                                   groups=groups)
+     if len(in_shape) >= 4:
+         scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False,
+                                       groups=groups)
+
+     return scratch
+
+
+ class ResidualConvUnit(nn.Module):
+     """Residual convolution module.
+     """
+
+     def __init__(self, features, activation, bn):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super().__init__()
+
+         self.bn = bn
+
+         self.groups = 1
+
+         self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+         self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+         if self.bn == True:
+             self.bn1 = nn.BatchNorm2d(features)
+             self.bn2 = nn.BatchNorm2d(features)
+
+         self.activation = activation
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+     def forward(self, x):
+         """Forward pass.
+
+         Args:
+             x (tensor): input
+
+         Returns:
+             tensor: output
+         """
+
+         out = self.activation(x)
+         out = self.conv1(out)
+         if self.bn == True:
+             out = self.bn1(out)
+
+         out = self.activation(out)
+         out = self.conv2(out)
+         if self.bn == True:
+             out = self.bn2(out)
+
+         if self.groups > 1:
+             out = self.conv_merge(out)
+
+         return self.skip_add.add(out, x)
+
+
+ class FeatureFusionBlock(nn.Module):
+     """Feature fusion block.
+     """
+
+     def __init__(
+         self,
+         features,
+         activation,
+         deconv=False,
+         bn=False,
+         expand=False,
+         align_corners=True,
+         size=None
+     ):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super(FeatureFusionBlock, self).__init__()
+
+         self.deconv = deconv
+         self.align_corners = align_corners
+
+         self.groups = 1
+
+         self.expand = expand
+         out_features = features
+         if self.expand == True:
+             out_features = features // 2
+
+         self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+         self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
+         self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+         self.size = size
+
+     def forward(self, *xs, size=None):
+         """Forward pass.
+
+         Returns:
+             tensor: output
+         """
+         output = xs[0]
+
+         if len(xs) == 2:
+             res = self.resConfUnit1(xs[1])
+             output = self.skip_add.add(output, res)
+
+         output = self.resConfUnit2(output)
+
+         if (size is None) and (self.size is None):
+             modifier = {"scale_factor": 2}
+         elif size is None:
+             modifier = {"size": self.size}
+         else:
+             modifier = {"size": size}
+
+         output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
+         output = self.out_conv(output)
+
+         return output
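
A minimal sketch of how these DPT-style pieces compose (import path assumed; channel counts and spatial sizes are illustrative, and the real decoder builds one `FeatureFusionBlock` per feature level). `_make_scratch` projects backbone features to a common width, and each fusion block adds an optional skip input, refines it, and upsamples by 2x:

import torch
import torch.nn as nn
from vace.annotators.depth_anything_v2.util.blocks import _make_scratch, FeatureFusionBlock  # assumed path

scratch = _make_scratch([256, 512, 1024, 1024], out_shape=256)
feat3 = scratch.layer3_rn(torch.randn(1, 1024, 24, 24))  # -> (1, 256, 24, 24)
feat4 = scratch.layer4_rn(torch.randn(1, 1024, 12, 12))  # -> (1, 256, 12, 12)

fuse = FeatureFusionBlock(256, nn.ReLU(False), bn=False, align_corners=True)
out = fuse(feat4)        # refine + 2x upsample -> (1, 256, 24, 24)
out = fuse(out, feat3)   # add skip, refine, 2x upsample -> (1, 256, 48, 48)
print(out.shape)
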
vace/annotators/depth_anything_v2/util/transform.py ADDED
@@ -0,0 +1,159 @@
+ import cv2
+ import numpy as np
+
+
+ class Resize(object):
+     """Resize sample to given size (width, height).
+     """
+
+     def __init__(
+         self,
+         width,
+         height,
+         resize_target=True,
+         keep_aspect_ratio=False,
+         ensure_multiple_of=1,
+         resize_method="lower_bound",
+         image_interpolation_method=cv2.INTER_AREA,
+     ):
+         """Init.
+
+         Args:
+             width (int): desired output width
+             height (int): desired output height
+             resize_target (bool, optional):
+                 True: Resize the full sample (image, mask, target).
+                 False: Resize image only.
+                 Defaults to True.
+             keep_aspect_ratio (bool, optional):
+                 True: Keep the aspect ratio of the input sample.
+                 Output sample might not have the given width and height, and
+                 resize behaviour depends on the parameter 'resize_method'.
+                 Defaults to False.
+             ensure_multiple_of (int, optional):
+                 Output width and height are constrained to be a multiple of this parameter.
+                 Defaults to 1.
+             resize_method (str, optional):
+                 "lower_bound": Output will be at least as large as the given size.
+                 "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+                 "minimal": Scale as little as possible. (Output size might be smaller than given size.)
+                 Defaults to "lower_bound".
+         """
+         self.__width = width
+         self.__height = height
+
+         self.__resize_target = resize_target
+         self.__keep_aspect_ratio = keep_aspect_ratio
+         self.__multiple_of = ensure_multiple_of
+         self.__resize_method = resize_method
+         self.__image_interpolation_method = image_interpolation_method
+
+     def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+         y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+         if max_val is not None and y > max_val:
+             y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+         if y < min_val:
+             y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+         return y
+
+     def get_size(self, width, height):
+         # determine new height and width
+         scale_height = self.__height / height
+         scale_width = self.__width / width
+
+         if self.__keep_aspect_ratio:
+             if self.__resize_method == "lower_bound":
+                 # scale such that output size is lower bound
+                 if scale_width > scale_height:
+                     # fit width
+                     scale_height = scale_width
+                 else:
+                     # fit height
+                     scale_width = scale_height
+             elif self.__resize_method == "upper_bound":
+                 # scale such that output size is upper bound
+                 if scale_width < scale_height:
+                     # fit width
+                     scale_height = scale_width
+                 else:
+                     # fit height
+                     scale_width = scale_height
+             elif self.__resize_method == "minimal":
+                 # scale as little as possible
+                 if abs(1 - scale_width) < abs(1 - scale_height):
+                     # fit width
+                     scale_height = scale_width
+                 else:
+                     # fit height
+                     scale_width = scale_height
+             else:
+                 raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+         if self.__resize_method == "lower_bound":
+             new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
+             new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
+         elif self.__resize_method == "upper_bound":
+             new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
+             new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
+         elif self.__resize_method == "minimal":
+             new_height = self.constrain_to_multiple_of(scale_height * height)
+             new_width = self.constrain_to_multiple_of(scale_width * width)
+         else:
+             raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+         return (new_width, new_height)
+
+     def __call__(self, sample):
+         width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
+
+         # resize sample
+         sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
+
+         if self.__resize_target:
+             if "depth" in sample:
+                 sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
+
+             if "mask" in sample:
+                 sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height),
+                                             interpolation=cv2.INTER_NEAREST)
+
+         return sample
+
+
+ class NormalizeImage(object):
+     """Normalize image by given mean and std.
+     """
+
+     def __init__(self, mean, std):
+         self.__mean = mean
+         self.__std = std
+
+     def __call__(self, sample):
+         sample["image"] = (sample["image"] - self.__mean) / self.__std
+
+         return sample
+
+
+ class PrepareForNet(object):
+     """Prepare sample for usage as network input.
+     """
+
+     def __init__(self):
+         pass
+
+     def __call__(self, sample):
+         image = np.transpose(sample["image"], (2, 0, 1))
+         sample["image"] = np.ascontiguousarray(image).astype(np.float32)
+
+         if "depth" in sample:
+             depth = sample["depth"].astype(np.float32)
+             sample["depth"] = np.ascontiguousarray(depth)
+
+         if "mask" in sample:
+             sample["mask"] = sample["mask"].astype(np.float32)
+             sample["mask"] = np.ascontiguousarray(sample["mask"])
+
+         return sample
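
The three transforms are meant to be chained into a single preprocessing step. A minimal sketch, assuming `torchvision` provides `Compose` and using the 518-px / multiple-of-14 / ImageNet constants of the usual Depth Anything setup (the values this repo's depth annotator actually uses may differ):

import cv2
from torchvision.transforms import Compose
from vace.annotators.depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet  # assumed path

transform = Compose([
    Resize(518, 518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method="lower_bound",
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

raw = cv2.imread("assets/images/test.jpg")            # BGR uint8
image = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB) / 255.0  # float RGB in [0, 1]
sample = transform({"image": image})
print(sample["image"].shape)                          # (3, H', W'), both multiples of 14
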
vace/annotators/dwpose/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
vace/annotators/dwpose/onnxdet.py ADDED
@@ -0,0 +1,127 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import cv2
+ import numpy as np
+
+ import onnxruntime
+
+ def nms(boxes, scores, nms_thr):
+     """Single class NMS implemented in Numpy."""
+     x1 = boxes[:, 0]
+     y1 = boxes[:, 1]
+     x2 = boxes[:, 2]
+     y2 = boxes[:, 3]
+
+     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+     order = scores.argsort()[::-1]
+
+     keep = []
+     while order.size > 0:
+         i = order[0]
+         keep.append(i)
+         xx1 = np.maximum(x1[i], x1[order[1:]])
+         yy1 = np.maximum(y1[i], y1[order[1:]])
+         xx2 = np.minimum(x2[i], x2[order[1:]])
+         yy2 = np.minimum(y2[i], y2[order[1:]])
+
+         w = np.maximum(0.0, xx2 - xx1 + 1)
+         h = np.maximum(0.0, yy2 - yy1 + 1)
+         inter = w * h
+         ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+         inds = np.where(ovr <= nms_thr)[0]
+         order = order[inds + 1]
+
+     return keep
+
+ def multiclass_nms(boxes, scores, nms_thr, score_thr):
+     """Multiclass NMS implemented in Numpy. Class-aware version."""
+     final_dets = []
+     num_classes = scores.shape[1]
+     for cls_ind in range(num_classes):
+         cls_scores = scores[:, cls_ind]
+         valid_score_mask = cls_scores > score_thr
+         if valid_score_mask.sum() == 0:
+             continue
+         else:
+             valid_scores = cls_scores[valid_score_mask]
+             valid_boxes = boxes[valid_score_mask]
+             keep = nms(valid_boxes, valid_scores, nms_thr)
+             if len(keep) > 0:
+                 cls_inds = np.ones((len(keep), 1)) * cls_ind
+                 dets = np.concatenate(
+                     [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+                 )
+                 final_dets.append(dets)
+     if len(final_dets) == 0:
+         return None
+     return np.concatenate(final_dets, 0)
+
+ def demo_postprocess(outputs, img_size, p6=False):
+     grids = []
+     expanded_strides = []
+     strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+     hsizes = [img_size[0] // stride for stride in strides]
+     wsizes = [img_size[1] // stride for stride in strides]
+
+     for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+         xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+         grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+         grids.append(grid)
+         shape = grid.shape[:2]
+         expanded_strides.append(np.full((*shape, 1), stride))
+
+     grids = np.concatenate(grids, 1)
+     expanded_strides = np.concatenate(expanded_strides, 1)
+     outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+     outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+     return outputs
+
+ def preprocess(img, input_size, swap=(2, 0, 1)):
+     if len(img.shape) == 3:
+         padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+     else:
+         padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+     r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+     resized_img = cv2.resize(
+         img,
+         (int(img.shape[1] * r), int(img.shape[0] * r)),
+         interpolation=cv2.INTER_LINEAR,
+     ).astype(np.uint8)
+     padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+     padded_img = padded_img.transpose(swap)
+     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+     return padded_img, r
+
+ def inference_detector(session, oriImg):
+     input_shape = (640, 640)
+     img, ratio = preprocess(oriImg, input_shape)
+
+     ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
+     output = session.run(None, ort_inputs)
+     predictions = demo_postprocess(output[0], input_shape)[0]
+
+     boxes = predictions[:, :4]
+     scores = predictions[:, 4:5] * predictions[:, 5:]
+
+     boxes_xyxy = np.ones_like(boxes)
+     boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
+     boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
+     boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
+     boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
+     boxes_xyxy /= ratio
+     dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+     if dets is not None:
+         final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+         isscore = final_scores > 0.3
+         iscat = final_cls_inds == 0
+         isbbox = [i and j for (i, j) in zip(isscore, iscat)]
+         final_boxes = final_boxes[isbbox]
+     else:
+         final_boxes = np.array([])
+
+     return final_boxes
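
A minimal sketch of calling the detector, assuming an ONNXRuntime session for a YOLOX-style person-detection model (the `.onnx` path is hypothetical):

import cv2
import onnxruntime as ort
from vace.annotators.dwpose.onnxdet import inference_detector

session = ort.InferenceSession("models/yolox_l.onnx",              # hypothetical weights path
                               providers=["CPUExecutionProvider"])
img = cv2.imread("assets/images/test.jpg")   # HxWx3 uint8
boxes = inference_detector(session, img)     # (N, 4) person boxes as x1, y1, x2, y2 (empty array if none)
print(boxes.shape)
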
vace/annotators/dwpose/onnxpose.py ADDED
@@ -0,0 +1,362 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import List, Tuple
+
+ import cv2
+ import numpy as np
+ import onnxruntime as ort
+
+ def preprocess(
+     img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+     """Do preprocessing for RTMPose model inference.
+
+     Args:
+         img (np.ndarray): Input image in shape.
+         input_size (tuple): Input image size in shape (w, h).
+
+     Returns:
+         tuple:
+         - resized_img (np.ndarray): Preprocessed image.
+         - center (np.ndarray): Center of image.
+         - scale (np.ndarray): Scale of image.
+     """
+     # get shape of image
+     img_shape = img.shape[:2]
+     out_img, out_center, out_scale = [], [], []
+     if len(out_bbox) == 0:
+         out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
+     for i in range(len(out_bbox)):
+         x0 = out_bbox[i][0]
+         y0 = out_bbox[i][1]
+         x1 = out_bbox[i][2]
+         y1 = out_bbox[i][3]
+         bbox = np.array([x0, y0, x1, y1])
+
+         # get center and scale
+         center, scale = bbox_xyxy2cs(bbox, padding=1.25)
+
+         # do affine transformation
+         resized_img, scale = top_down_affine(input_size, scale, center, img)
+
+         # normalize image
+         mean = np.array([123.675, 116.28, 103.53])
+         std = np.array([58.395, 57.12, 57.375])
+         resized_img = (resized_img - mean) / std
+
+         out_img.append(resized_img)
+         out_center.append(center)
+         out_scale.append(scale)
+
+     return out_img, out_center, out_scale
+
+
+ def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
+     """Inference RTMPose model.
+
+     Args:
+         sess (ort.InferenceSession): ONNXRuntime session.
+         img (np.ndarray): Input image in shape.
+
+     Returns:
+         outputs (np.ndarray): Output of RTMPose model.
+     """
+     all_out = []
+     # build input
+     for i in range(len(img)):
+         input = [img[i].transpose(2, 0, 1)]
+
+         # build output
+         sess_input = {sess.get_inputs()[0].name: input}
+         sess_output = []
+         for out in sess.get_outputs():
+             sess_output.append(out.name)
+
+         # run model
+         outputs = sess.run(sess_output, sess_input)
+         all_out.append(outputs)
+
+     return all_out
+
+
+ def postprocess(outputs: List[np.ndarray],
+                 model_input_size: Tuple[int, int],
+                 center: Tuple[int, int],
+                 scale: Tuple[int, int],
+                 simcc_split_ratio: float = 2.0
+                 ) -> Tuple[np.ndarray, np.ndarray]:
+     """Postprocess for RTMPose model output.
+
+     Args:
+         outputs (np.ndarray): Output of RTMPose model.
+         model_input_size (tuple): RTMPose model input image size.
+         center (tuple): Center of bbox in shape (x, y).
+         scale (tuple): Scale of bbox in shape (w, h).
+         simcc_split_ratio (float): Split ratio of simcc.
+
+     Returns:
+         tuple:
+         - keypoints (np.ndarray): Rescaled keypoints.
+         - scores (np.ndarray): Model predict scores.
+     """
+     all_key = []
+     all_score = []
+     for i in range(len(outputs)):
+         # use simcc to decode
+         simcc_x, simcc_y = outputs[i]
+         keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
+
+         # rescale keypoints
+         keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
+         all_key.append(keypoints[0])
+         all_score.append(scores[0])
+
+     return np.array(all_key), np.array(all_score)
+
+
+ def bbox_xyxy2cs(bbox: np.ndarray,
+                  padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
+     """Transform the bbox format from (x1, y1, x2, y2) into (center, scale)
+
+     Args:
+         bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
+             as (left, top, right, bottom)
+         padding (float): BBox padding factor that will be multiplied to scale.
+             Default: 1.0
+
+     Returns:
+         tuple: A tuple containing center and scale.
+         - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
+             (n, 2)
+         - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
+             (n, 2)
+     """
+     # convert single bbox from (4, ) to (1, 4)
+     dim = bbox.ndim
+     if dim == 1:
+         bbox = bbox[None, :]
+
+     # get bbox center and scale
+     x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
+     center = np.hstack([x1 + x2, y1 + y2]) * 0.5
+     scale = np.hstack([x2 - x1, y2 - y1]) * padding
+
+     if dim == 1:
+         center = center[0]
+         scale = scale[0]
+
+     return center, scale
+
+
+ def _fix_aspect_ratio(bbox_scale: np.ndarray,
+                       aspect_ratio: float) -> np.ndarray:
+     """Extend the scale to match the given aspect ratio.
+
+     Args:
+         scale (np.ndarray): The image scale (w, h) in shape (2, )
+         aspect_ratio (float): The ratio of ``w/h``
+
+     Returns:
+         np.ndarray: The reshaped image scale in (2, )
+     """
+     w, h = np.hsplit(bbox_scale, [1])
+     bbox_scale = np.where(w > h * aspect_ratio,
+                           np.hstack([w, w / aspect_ratio]),
+                           np.hstack([h * aspect_ratio, h]))
+     return bbox_scale
+
+
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
+     """Rotate a point by an angle.
+
+     Args:
+         pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
+         angle_rad (float): rotation angle in radian
+
+     Returns:
+         np.ndarray: Rotated point in shape (2, )
+     """
+     sn, cs = np.sin(angle_rad), np.cos(angle_rad)
+     rot_mat = np.array([[cs, -sn], [sn, cs]])
+     return rot_mat @ pt
+
+
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+     """To calculate the affine matrix, three pairs of points are required. This
+     function is used to get the 3rd point, given 2D points a & b.
+
+     The 3rd point is defined by rotating vector `a - b` by 90 degrees
+     anticlockwise, using b as the rotation center.
+
+     Args:
+         a (np.ndarray): The 1st point (x,y) in shape (2, )
+         b (np.ndarray): The 2nd point (x,y) in shape (2, )
+
+     Returns:
+         np.ndarray: The 3rd point.
+     """
+     direction = a - b
+     c = b + np.r_[-direction[1], direction[0]]
+     return c
+
+
+ def get_warp_matrix(center: np.ndarray,
+                     scale: np.ndarray,
+                     rot: float,
+                     output_size: Tuple[int, int],
+                     shift: Tuple[float, float] = (0., 0.),
+                     inv: bool = False) -> np.ndarray:
+     """Calculate the affine transformation matrix that can warp the bbox area
+     in the input image to the output size.
+
+     Args:
+         center (np.ndarray[2, ]): Center of the bounding box (x, y).
+         scale (np.ndarray[2, ]): Scale of the bounding box
+             wrt [width, height].
+         rot (float): Rotation angle (degree).
+         output_size (np.ndarray[2, ] | list(2,)): Size of the
+             destination heatmaps.
+         shift (0-100%): Shift translation ratio wrt the width/height.
+             Default (0., 0.).
+         inv (bool): Option to inverse the affine transform direction.
+             (inv=False: src->dst or inv=True: dst->src)
+
+     Returns:
+         np.ndarray: A 2x3 transformation matrix
+     """
+     shift = np.array(shift)
+     src_w = scale[0]
+     dst_w = output_size[0]
+     dst_h = output_size[1]
+
+     # compute transformation matrix
+     rot_rad = np.deg2rad(rot)
+     src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
+     dst_dir = np.array([0., dst_w * -0.5])
+
+     # get four corners of the src rectangle in the original image
+     src = np.zeros((3, 2), dtype=np.float32)
+     src[0, :] = center + scale * shift
+     src[1, :] = center + src_dir + scale * shift
+     src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+     # get four corners of the dst rectangle in the input image
+     dst = np.zeros((3, 2), dtype=np.float32)
+     dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+     dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+     dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+     if inv:
+         warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+     else:
+         warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+     return warp_mat
+
+
+ def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
+                     img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+     """Get the bbox image as the model input by affine transform.
+
+     Args:
+         input_size (dict): The input size of the model.
+         bbox_scale (dict): The bbox scale of the img.
+         bbox_center (dict): The bbox center of the img.
+         img (np.ndarray): The original image.
+
+     Returns:
+         tuple: A tuple containing center and scale.
+         - np.ndarray[float32]: img after affine transform.
+         - np.ndarray[float32]: bbox scale after affine transform.
+     """
+     w, h = input_size
+     warp_size = (int(w), int(h))
+
+     # reshape bbox to fixed aspect ratio
+     bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
+
+     # get the affine matrix
+     center = bbox_center
+     scale = bbox_scale
+     rot = 0
+     warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
+
+     # do affine transform
+     img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
+
+     return img, bbox_scale
+
+
+ def get_simcc_maximum(simcc_x: np.ndarray,
+                       simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+     """Get maximum response location and value from simcc representations.
+
+     Note:
+         instance number: N
+         num_keypoints: K
+         heatmap height: H
+         heatmap width: W
+
+     Args:
+         simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
+         simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
+
+     Returns:
+         tuple:
+         - locs (np.ndarray): locations of maximum heatmap responses in shape
+             (K, 2) or (N, K, 2)
+         - vals (np.ndarray): values of maximum heatmap responses in shape
+             (K,) or (N, K)
+     """
+     N, K, Wx = simcc_x.shape
+     simcc_x = simcc_x.reshape(N * K, -1)
+     simcc_y = simcc_y.reshape(N * K, -1)
+
+     # get maximum value locations
+     x_locs = np.argmax(simcc_x, axis=1)
+     y_locs = np.argmax(simcc_y, axis=1)
+     locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
+     max_val_x = np.amax(simcc_x, axis=1)
+     max_val_y = np.amax(simcc_y, axis=1)
+
+     # get maximum value across x and y axis
+     mask = max_val_x > max_val_y
+     max_val_x[mask] = max_val_y[mask]
+     vals = max_val_x
+     locs[vals <= 0.] = -1
+
+     # reshape
+     locs = locs.reshape(N, K, 2)
+     vals = vals.reshape(N, K)
+
+     return locs, vals
+
+
+ def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
+            simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
+     """Modulate simcc distribution with Gaussian.
+
+     Args:
+         simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
+         simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
+         simcc_split_ratio (int): The split ratio of simcc.
+
+     Returns:
+         tuple: A tuple containing center and scale.
+         - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
+         - np.ndarray[float32]: scores in shape (K,) or (n, K)
+     """
+     keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
+     keypoints /= simcc_split_ratio
+
+     return keypoints, scores
+
+
+ def inference_pose(session, out_bbox, oriImg):
+     h, w = session.get_inputs()[0].shape[2:]
+     model_input_size = (w, h)
+     resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
+     outputs = inference(session, resized_img)
+     keypoints, scores = postprocess(outputs, model_input_size, center, scale)
+
+     return keypoints, scores
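
A minimal sketch of the two-stage DWPose flow implied by these two modules: detect people first, then estimate keypoints per box (both `.onnx` paths are hypothetical, and `inference` relies on ONNXRuntime converting the Python-list input to the model's expected float type):

import cv2
import onnxruntime as ort
from vace.annotators.dwpose.onnxdet import inference_detector
from vace.annotators.dwpose.onnxpose import inference_pose

det_sess = ort.InferenceSession("models/yolox_l.onnx", providers=["CPUExecutionProvider"])            # hypothetical
pose_sess = ort.InferenceSession("models/dw-ll_ucoco_384.onnx", providers=["CPUExecutionProvider"])   # hypothetical

img = cv2.imread("assets/images/test.jpg")
boxes = inference_detector(det_sess, img)                  # (N, 4) person boxes
keypoints, scores = inference_pose(pose_sess, boxes, img)  # (N, K, 2) keypoints and (N, K) scores
print(keypoints.shape, scores.shape)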