fffiloni committed
Commit c7de15e · verified · 1 Parent(s): 8c9dae5

Migrated from GitHub

This view is limited to 50 files because the commit contains too many changes. See the raw diff for the full list.
Files changed (50)
  1. .gitattributes +9 -0
  2. LICENSE.txt +201 -0
  3. ORIGINAL_README.md +189 -0
  4. UserGuide.md +160 -0
  5. __init__.py +0 -0
  6. assets/images/girl.png +3 -0
  7. assets/images/snake.png +3 -0
  8. assets/images/test.jpg +3 -0
  9. assets/images/test2.jpg +0 -0
  10. assets/images/test3.jpg +3 -0
  11. assets/masks/test.png +0 -0
  12. assets/masks/test2.png +0 -0
  13. assets/materials/gr_infer_demo.jpg +3 -0
  14. assets/materials/gr_pre_demo.jpg +3 -0
  15. assets/materials/tasks.png +3 -0
  16. assets/materials/teaser.jpg +3 -0
  17. assets/videos/test.mp4 +3 -0
  18. assets/videos/test2.mp4 +0 -0
  19. pyproject.toml +75 -0
  20. requirements.txt +1 -0
  21. requirements/annotator.txt +6 -0
  22. requirements/framework.txt +26 -0
  23. run_vace_ltx.sh +48 -0
  24. run_vace_pipeline.sh +27 -0
  25. run_vace_preproccess.sh +58 -0
  26. run_vace_wan.sh +48 -0
  27. tests/test_annotators.py +568 -0
  28. vace/__init__.py +6 -0
  29. vace/annotators/__init__.py +24 -0
  30. vace/annotators/canvas.py +60 -0
  31. vace/annotators/common.py +62 -0
  32. vace/annotators/composition.py +155 -0
  33. vace/annotators/depth.py +88 -0
  34. vace/annotators/depth_anything_v2/__init__.py +0 -0
  35. vace/annotators/depth_anything_v2/dinov2.py +414 -0
  36. vace/annotators/depth_anything_v2/dpt.py +210 -0
  37. vace/annotators/depth_anything_v2/layers/__init__.py +11 -0
  38. vace/annotators/depth_anything_v2/layers/attention.py +79 -0
  39. vace/annotators/depth_anything_v2/layers/block.py +252 -0
  40. vace/annotators/depth_anything_v2/layers/drop_path.py +34 -0
  41. vace/annotators/depth_anything_v2/layers/layer_scale.py +28 -0
  42. vace/annotators/depth_anything_v2/layers/mlp.py +39 -0
  43. vace/annotators/depth_anything_v2/layers/patch_embed.py +90 -0
  44. vace/annotators/depth_anything_v2/layers/swiglu_ffn.py +64 -0
  45. vace/annotators/depth_anything_v2/util/__init__.py +0 -0
  46. vace/annotators/depth_anything_v2/util/blocks.py +151 -0
  47. vace/annotators/depth_anything_v2/util/transform.py +159 -0
  48. vace/annotators/dwpose/__init__.py +2 -0
  49. vace/annotators/dwpose/onnxdet.py +127 -0
  50. vace/annotators/dwpose/onnxpose.py +362 -0
.gitattributes CHANGED
@@ -33,3 +33,12 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/images/girl.png filter=lfs diff=lfs merge=lfs -text
+ assets/images/snake.png filter=lfs diff=lfs merge=lfs -text
+ assets/images/test.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/images/test3.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/materials/gr_infer_demo.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/materials/gr_pre_demo.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/materials/tasks.png filter=lfs diff=lfs merge=lfs -text
+ assets/materials/teaser.jpg filter=lfs diff=lfs merge=lfs -text
+ assets/videos/test.mp4 filter=lfs diff=lfs merge=lfs -text
LICENSE.txt ADDED
@@ -0,0 +1,201 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6
+
7
+ 1. Definitions.
8
+
9
+ "License" shall mean the terms and conditions for use, reproduction,
10
+ and distribution as defined by Sections 1 through 9 of this document.
11
+
12
+ "Licensor" shall mean the copyright owner or entity authorized by
13
+ the copyright owner that is granting the License.
14
+
15
+ "Legal Entity" shall mean the union of the acting entity and all
16
+ other entities that control, are controlled by, or are under common
17
+ control with that entity. For the purposes of this definition,
18
+ "control" means (i) the power, direct or indirect, to cause the
19
+ direction or management of such entity, whether by contract or
20
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
21
+ outstanding shares, or (iii) beneficial ownership of such entity.
22
+
23
+ "You" (or "Your") shall mean an individual or Legal Entity
24
+ exercising permissions granted by this License.
25
+
26
+ "Source" form shall mean the preferred form for making modifications,
27
+ including but not limited to software source code, documentation
28
+ source, and configuration files.
29
+
30
+ "Object" form shall mean any form resulting from mechanical
31
+ transformation or translation of a Source form, including but
32
+ not limited to compiled object code, generated documentation,
33
+ and conversions to other media types.
34
+
35
+ "Work" shall mean the work of authorship, whether in Source or
36
+ Object form, made available under the License, as indicated by a
37
+ copyright notice that is included in or attached to the work
38
+ (an example is provided in the Appendix below).
39
+
40
+ "Derivative Works" shall mean any work, whether in Source or Object
41
+ form, that is based on (or derived from) the Work and for which the
42
+ editorial revisions, annotations, elaborations, or other modifications
43
+ represent, as a whole, an original work of authorship. For the purposes
44
+ of this License, Derivative Works shall not include works that remain
45
+ separable from, or merely link (or bind by name) to the interfaces of,
46
+ the Work and Derivative Works thereof.
47
+
48
+ "Contribution" shall mean any work of authorship, including
49
+ the original version of the Work and any modifications or additions
50
+ to that Work or Derivative Works thereof, that is intentionally
51
+ submitted to Licensor for inclusion in the Work by the copyright owner
52
+ or by an individual or Legal Entity authorized to submit on behalf of
53
+ the copyright owner. For the purposes of this definition, "submitted"
54
+ means any form of electronic, verbal, or written communication sent
55
+ to the Licensor or its representatives, including but not limited to
56
+ communication on electronic mailing lists, source code control systems,
57
+ and issue tracking systems that are managed by, or on behalf of, the
58
+ Licensor for the purpose of discussing and improving the Work, but
59
+ excluding communication that is conspicuously marked or otherwise
60
+ designated in writing by the copyright owner as "Not a Contribution."
61
+
62
+ "Contributor" shall mean Licensor and any individual or Legal Entity
63
+ on behalf of whom a Contribution has been received by Licensor and
64
+ subsequently incorporated within the Work.
65
+
66
+ 2. Grant of Copyright License. Subject to the terms and conditions of
67
+ this License, each Contributor hereby grants to You a perpetual,
68
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69
+ copyright license to reproduce, prepare Derivative Works of,
70
+ publicly display, publicly perform, sublicense, and distribute the
71
+ Work and such Derivative Works in Source or Object form.
72
+
73
+ 3. Grant of Patent License. Subject to the terms and conditions of
74
+ this License, each Contributor hereby grants to You a perpetual,
75
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76
+ (except as stated in this section) patent license to make, have made,
77
+ use, offer to sell, sell, import, and otherwise transfer the Work,
78
+ where such license applies only to those patent claims licensable
79
+ by such Contributor that are necessarily infringed by their
80
+ Contribution(s) alone or by combination of their Contribution(s)
81
+ with the Work to which such Contribution(s) was submitted. If You
82
+ institute patent litigation against any entity (including a
83
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
84
+ or a Contribution incorporated within the Work constitutes direct
85
+ or contributory patent infringement, then any patent licenses
86
+ granted to You under this License for that Work shall terminate
87
+ as of the date such litigation is filed.
88
+
89
+ 4. Redistribution. You may reproduce and distribute copies of the
90
+ Work or Derivative Works thereof in any medium, with or without
91
+ modifications, and in Source or Object form, provided that You
92
+ meet the following conditions:
93
+
94
+ (a) You must give any other recipients of the Work or
95
+ Derivative Works a copy of this License; and
96
+
97
+ (b) You must cause any modified files to carry prominent notices
98
+ stating that You changed the files; and
99
+
100
+ (c) You must retain, in the Source form of any Derivative Works
101
+ that You distribute, all copyright, patent, trademark, and
102
+ attribution notices from the Source form of the Work,
103
+ excluding those notices that do not pertain to any part of
104
+ the Derivative Works; and
105
+
106
+ (d) If the Work includes a "NOTICE" text file as part of its
107
+ distribution, then any Derivative Works that You distribute must
108
+ include a readable copy of the attribution notices contained
109
+ within such NOTICE file, excluding those notices that do not
110
+ pertain to any part of the Derivative Works, in at least one
111
+ of the following places: within a NOTICE text file distributed
112
+ as part of the Derivative Works; within the Source form or
113
+ documentation, if provided along with the Derivative Works; or,
114
+ within a display generated by the Derivative Works, if and
115
+ wherever such third-party notices normally appear. The contents
116
+ of the NOTICE file are for informational purposes only and
117
+ do not modify the License. You may add Your own attribution
118
+ notices within Derivative Works that You distribute, alongside
119
+ or as an addendum to the NOTICE text from the Work, provided
120
+ that such additional attribution notices cannot be construed
121
+ as modifying the License.
122
+
123
+ You may add Your own copyright statement to Your modifications and
124
+ may provide additional or different license terms and conditions
125
+ for use, reproduction, or distribution of Your modifications, or
126
+ for any such Derivative Works as a whole, provided Your use,
127
+ reproduction, and distribution of the Work otherwise complies with
128
+ the conditions stated in this License.
129
+
130
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
131
+ any Contribution intentionally submitted for inclusion in the Work
132
+ by You to the Licensor shall be under the terms and conditions of
133
+ this License, without any additional terms or conditions.
134
+ Notwithstanding the above, nothing herein shall supersede or modify
135
+ the terms of any separate license agreement you may have executed
136
+ with Licensor regarding such Contributions.
137
+
138
+ 6. Trademarks. This License does not grant permission to use the trade
139
+ names, trademarks, service marks, or product names of the Licensor,
140
+ except as required for reasonable and customary use in describing the
141
+ origin of the Work and reproducing the content of the NOTICE file.
142
+
143
+ 7. Disclaimer of Warranty. Unless required by applicable law or
144
+ agreed to in writing, Licensor provides the Work (and each
145
+ Contributor provides its Contributions) on an "AS IS" BASIS,
146
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147
+ implied, including, without limitation, any warranties or conditions
148
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149
+ PARTICULAR PURPOSE. You are solely responsible for determining the
150
+ appropriateness of using or redistributing the Work and assume any
151
+ risks associated with Your exercise of permissions under this License.
152
+
153
+ 8. Limitation of Liability. In no event and under no legal theory,
154
+ whether in tort (including negligence), contract, or otherwise,
155
+ unless required by applicable law (such as deliberate and grossly
156
+ negligent acts) or agreed to in writing, shall any Contributor be
157
+ liable to You for damages, including any direct, indirect, special,
158
+ incidental, or consequential damages of any character arising as a
159
+ result of this License or out of the use or inability to use the
160
+ Work (including but not limited to damages for loss of goodwill,
161
+ work stoppage, computer failure or malfunction, or any and all
162
+ other commercial damages or losses), even if such Contributor
163
+ has been advised of the possibility of such damages.
164
+
165
+ 9. Accepting Warranty or Additional Liability. While redistributing
166
+ the Work or Derivative Works thereof, You may choose to offer,
167
+ and charge a fee for, acceptance of support, warranty, indemnity,
168
+ or other liability obligations and/or rights consistent with this
169
+ License. However, in accepting such obligations, You may act only
170
+ on Your own behalf and on Your sole responsibility, not on behalf
171
+ of any other Contributor, and only if You agree to indemnify,
172
+ defend, and hold each Contributor harmless for any liability
173
+ incurred by, or claims asserted against, such Contributor by reason
174
+ of your accepting any such warranty or additional liability.
175
+
176
+ END OF TERMS AND CONDITIONS
177
+
178
+ APPENDIX: How to apply the Apache License to your work.
179
+
180
+ To apply the Apache License to your work, attach the following
181
+ boilerplate notice, with the fields enclosed by brackets "[]"
182
+ replaced with your own identifying information. (Don't include
183
+ the brackets!) The text should be enclosed in the appropriate
184
+ comment syntax for the file format. We also recommend that a
185
+ file or class name and description of purpose be included on the
186
+ same "printed page" as the copyright notice for easier
187
+ identification within third-party archives.
188
+
189
+ Copyright [yyyy] [name of copyright owner]
190
+
191
+ Licensed under the Apache License, Version 2.0 (the "License");
192
+ you may not use this file except in compliance with the License.
193
+ You may obtain a copy of the License at
194
+
195
+ http://www.apache.org/licenses/LICENSE-2.0
196
+
197
+ Unless required by applicable law or agreed to in writing, software
198
+ distributed under the License is distributed on an "AS IS" BASIS,
199
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200
+ See the License for the specific language governing permissions and
201
+ limitations under the License.
ORIGINAL_README.md ADDED
@@ -0,0 +1,189 @@
1
+ <p align="center">
2
+
3
+ <h1 align="center">VACE: All-in-One Video Creation and Editing</h1>
4
+ <p align="center">
5
+ <strong>Zeyinzi Jiang<sup>*</sup></strong>
6
+ ·
7
+ <strong>Zhen Han<sup>*</sup></strong>
8
+ ·
9
+ <strong>Chaojie Mao<sup>*&dagger;</sup></strong>
10
+ ·
11
+ <strong>Jingfeng Zhang</strong>
12
+ ·
13
+ <strong>Yulin Pan</strong>
14
+ ·
15
+ <strong>Yu Liu</strong>
16
+ <br>
17
+ <b>Tongyi Lab - <a href="https://github.com/Wan-Video/Wan2.1"><img src='https://ali-vilab.github.io/VACE-Page/assets/logos/wan_logo.png' alt='wan_logo' style='margin-bottom: -4px; height: 20px;'></a> </b>
18
+ <br>
19
+ <br>
20
+ <a href="https://arxiv.org/abs/2503.07598"><img src='https://img.shields.io/badge/VACE-arXiv-red' alt='Paper PDF'></a>
21
+ <a href="https://ali-vilab.github.io/VACE-Page/"><img src='https://img.shields.io/badge/VACE-Project_Page-green' alt='Project Page'></a>
22
+ <a href="https://huggingface.co/collections/ali-vilab/vace-67eca186ff3e3564726aff38"><img src='https://img.shields.io/badge/VACE-HuggingFace_Model-yellow'></a>
23
+ <a href="https://modelscope.cn/collections/VACE-8fa5fcfd386e43"><img src='https://img.shields.io/badge/VACE-ModelScope_Model-purple'></a>
24
+ <br>
25
+ </p>
26
+
27
+
28
+ ## Introduction
29
+ <strong>VACE</strong> is an all-in-one model designed for video creation and editing. It encompasses various tasks, including reference-to-video generation (<strong>R2V</strong>), video-to-video editing (<strong>V2V</strong>), and masked video-to-video editing (<strong>MV2V</strong>), allowing users to compose these tasks freely. This functionality enables users to explore diverse possibilities and streamlines their workflows effectively, offering a range of capabilities, such as Move-Anything, Swap-Anything, Reference-Anything, Expand-Anything, Animate-Anything, and more.
30
+
31
+ <img src='./assets/materials/teaser.jpg'>
32
+
33
+
34
+ ## 🎉 News
35
+ - [x] May 14, 2025: 🔥Wan2.1-VACE-1.3B and Wan2.1-VACE-14B models are now available at [HuggingFace](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) and [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B)!
36
+ - [x] Mar 31, 2025: 🔥VACE-Wan2.1-1.3B-Preview and VACE-LTX-Video-0.9 models are now available at [HuggingFace](https://huggingface.co/collections/ali-vilab/vace-67eca186ff3e3564726aff38) and [ModelScope](https://modelscope.cn/collections/VACE-8fa5fcfd386e43)!
37
+ - [x] Mar 31, 2025: 🔥Release code of model inference, preprocessing, and gradio demos.
38
+ - [x] Mar 11, 2025: We propose [VACE](https://ali-vilab.github.io/VACE-Page/), an all-in-one model for video creation and editing.
39
+
40
+
41
+ ## 🪄 Models
42
+ | Models | Download Link | Video Size | License |
43
+ |--------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------|-------------------|-----------------------------------------------------------------------------------------------|
44
+ | VACE-Wan2.1-1.3B-Preview | [Huggingface](https://huggingface.co/ali-vilab/VACE-Wan2.1-1.3B-Preview) 🤗 [ModelScope](https://modelscope.cn/models/iic/VACE-Wan2.1-1.3B-Preview) 🤖 | ~ 81 x 480 x 832 | [Apache-2.0](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B/blob/main/LICENSE.txt) |
45
+ | VACE-LTX-Video-0.9 | [Huggingface](https://huggingface.co/ali-vilab/VACE-LTX-Video-0.9) 🤗 [ModelScope](https://modelscope.cn/models/iic/VACE-LTX-Video-0.9) 🤖 | ~ 97 x 512 x 768 | [RAIL-M](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltx-video-2b-v0.9.license.txt) |
46
+ | Wan2.1-VACE-1.3B | [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-VACE-1.3B) 🤗 [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-1.3B) 🤖 | ~ 81 x 480 x 832 | [Apache-2.0](https://huggingface.co/Wan-AI/Wan2.1-T2V-1.3B/blob/main/LICENSE.txt) |
47
+ | Wan2.1-VACE-14B | [Huggingface](https://huggingface.co/Wan-AI/Wan2.1-VACE-14B) 🤗 [ModelScope](https://www.modelscope.cn/models/Wan-AI/Wan2.1-VACE-14B) 🤖 | ~ 81 x 720 x 1280 | [Apache-2.0](https://huggingface.co/Wan-AI/Wan2.1-T2V-14B/blob/main/LICENSE.txt) |
48
+
49
+ - The input supports any resolution, but optimal results are achieved when the video size falls within the ranges listed above.
50
+ - All models inherit the license of the original model.
51
+
52
+
53
+ ## ⚙️ Installation
54
+ The codebase was tested with Python 3.10.13, CUDA version 12.4, and PyTorch >= 2.5.1.
55
+
56
+ ### Setup for Model Inference
57
+ You can set up VACE model inference by running:
58
+ ```bash
59
+ git clone https://github.com/ali-vilab/VACE.git && cd VACE
60
+ pip install torch==2.5.1 torchvision==0.20.1 --index-url https://download.pytorch.org/whl/cu124 # If PyTorch is not installed.
61
+ pip install -r requirements.txt
62
+ pip install wan@git+https://github.com/Wan-Video/Wan2.1 # If you want to use Wan2.1-based VACE.
63
+ pip install ltx-video@git+https://github.com/Lightricks/LTX-Video@ltx-video-0.9.1 sentencepiece --no-deps # If you want to use LTX-Video-0.9-based VACE. It may conflict with Wan.
64
+ ```
65
+ Please download your preferred base model to `<repo-root>/models/`.
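+ For example, one way to fetch a model into that directory (a minimal sketch assuming the `huggingface-cli` tool from `huggingface_hub`; any download method works):
+ ```bash
+ pip install "huggingface_hub[cli]"
+ huggingface-cli download ali-vilab/VACE-Wan2.1-1.3B-Preview --local-dir models/VACE-Wan2.1-1.3B-Preview
+ ```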
66
+
67
+ ### Setup for Preprocess Tools
68
+ If you need preprocessing tools, please install:
69
+ ```bash
70
+ pip install -r requirements/annotator.txt
71
+ ```
72
+ Please download [VACE-Annotators](https://huggingface.co/ali-vilab/VACE-Annotators) to `<repo-root>/models/`.
73
+
74
+ ### Local Directories Setup
75
+ It is recommended to download [VACE-Benchmark](https://huggingface.co/datasets/ali-vilab/VACE-Benchmark) to `<repo-root>/benchmarks/`; it is used as example data in the `run_vace_xxx.sh` scripts.
76
+
77
+ We recommend organizing local directories as follows:
+ ```
79
+ VACE
80
+ ├── ...
81
+ ├── benchmarks
82
+ │ └── VACE-Benchmark
83
+ │ └── assets
84
+ │ └── examples
85
+ │ ├── animate_anything
86
+ │ │ └── ...
87
+ │ └── ...
88
+ ├── models
89
+ │ ├── VACE-Annotators
90
+ │ │ └── ...
91
+ │ ├── VACE-LTX-Video-0.9
92
+ │ │ └── ...
93
+ │ └── VACE-Wan2.1-1.3B-Preview
94
+ │ └── ...
95
+ └── ...
96
+ ```
97
+
98
+ ## 🚀 Usage
99
+ In VACE, users provide a **text prompt** and optional **video**, **mask**, and **image** inputs for video generation or editing.
100
+ Detailed instructions for using VACE can be found in the [User Guide](./UserGuide.md).
101
+
102
+ ### Inference CLI
103
+ #### 1) End-to-End Running
104
+ To run VACE without diving into implementation details, you can use the end-to-end pipeline. For example:
105
+ ```bash
106
+ # run V2V depth
107
+ python vace/vace_pipeline.py --base wan --task depth --video assets/videos/test.mp4 --prompt 'xxx'
108
+
109
+ # run MV2V inpainting by providing bbox
110
+ python vace/vace_pipeline.py --base wan --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4 --prompt 'xxx'
111
+ ```
112
+ This script runs video preprocessing and model inference sequentially;
+ you need to specify all the required args for preprocessing (`--task`, `--mode`, `--bbox`, `--video`, etc.) and inference (`--prompt`, etc.).
+ The output video, together with the intermediate video, mask, and images, will be saved to `./results/` by default.
115
+
116
+ > 💡**Note**:
117
+ > Please refer to [run_vace_pipeline.sh](./run_vace_pipeline.sh) for usage examples of different task pipelines.
118
+
119
+
120
+ #### 2) Preprocessing
121
+ For more flexible control over the input, user inputs need to be preprocessed into `src_video`, `src_mask`, and `src_ref_images` before VACE model inference.
+ We assign each [preprocessor](./vace/configs/__init__.py) a task name, so simply call [`vace_preproccess.py`](./vace/vace_preproccess.py) and specify the task name and its params. For example:
123
+ ```bash
124
+ # process video depth
125
+ python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
126
+
127
+ # process video inpainting by providing bbox
128
+ python vace/vace_preproccess.py --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4
129
+ ```
130
+ The outputs will be saved to `./processed/` by default.
131
+
132
+ > 💡**Note**:
133
+ > Please refer to [run_vace_pipeline.sh](./run_vace_pipeline.sh) for the preprocessing methods of different tasks.
+ > Moreover, refer to [vace/configs/](./vace/configs/) for all the pre-defined tasks and their required params.
+ > You can also customize preprocessors by implementing them in [`annotators`](./vace/annotators/__init__.py) and registering them in [`configs`](./vace/configs).
136
+
137
+
138
+ #### 3) Model Inference
+ Using the input data obtained from **Preprocessing**, model inference can be performed as follows:
140
+ ```bash
141
+ # For Wan2.1 single GPU inference (1.3B-480P)
142
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
143
+
144
+ # For Wan2.1 Multi GPU Acceleration inference (1.3B-480P)
145
+ pip install "xfuser>=0.4.1"
146
+ torchrun --nproc_per_node=8 vace/vace_wan_inference.py --dit_fsdp --t5_fsdp --ulysses_size 1 --ring_size 8 --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
147
+
148
+ # For Wan2.1 Multi GPU Acceleration inference (14B-720P)
149
+ torchrun --nproc_per_node=8 vace/vace_wan_inference.py --dit_fsdp --t5_fsdp --ulysses_size 8 --ring_size 1 --size 720p --model_name 'vace-14B' --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
150
+
151
+ # For LTX inference, run
152
+ python vace/vace_ltx_inference.py --ckpt_path <path-to-model> --text_encoder_path <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx"
153
+ ```
154
+ The output video, together with the intermediate video, mask, and images, will be saved to `./results/` by default.
155
+
156
+ > 💡**Note**:
157
+ > (1) Please refer to [vace/vace_wan_inference.py](./vace/vace_wan_inference.py) and [vace/vace_ltx_inference.py](./vace/vace_ltx_inference.py) for the inference args.
158
+ > (2) For LTX-Video, and for Wan2.1 with English prompts, prompt extension is needed to unlock the full model performance.
+ > Please follow the [instructions of Wan2.1](https://github.com/Wan-Video/Wan2.1?tab=readme-ov-file#2-using-prompt-extension) and set `--use_prompt_extend` when running inference (see the example below).
+ > (3) When using prompt extension for editing tasks, check the expanded prompt carefully: since the extension model cannot see the input video, the expanded text may not match the video being edited, which can affect the final result.
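+ A minimal example of the prompt-extension flag from note (2), reusing the single-GPU command above with the other args unchanged (depending on your setup, prompt extension may require additional configuration as described in the Wan2.1 instructions):
+ ```bash
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt "xxx" --use_prompt_extend
+ ```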
161
+
162
+ ### Inference Gradio
163
+ For preprocessors, run
164
+ ```bash
165
+ python vace/gradios/vace_preprocess_demo.py
166
+ ```
167
+ For model inference, run
168
+ ```bash
169
+ # For Wan2.1 gradio inference
170
+ python vace/gradios/vace_wan_demo.py
171
+
172
+ # For LTX gradio inference
173
+ python vace/gradios/vace_ltx_demo.py
174
+ ```
175
+
176
+ ## Acknowledgement
177
+
178
+ We are grateful for the following awesome projects, including [Scepter](https://github.com/modelscope/scepter), [Wan](https://github.com/Wan-Video/Wan2.1), and [LTX-Video](https://github.com/Lightricks/LTX-Video).
179
+
180
+
181
+ ## BibTeX
182
+
183
+ ```bibtex
184
+ @article{vace,
185
+ title = {VACE: All-in-One Video Creation and Editing},
186
+ author = {Jiang, Zeyinzi and Han, Zhen and Mao, Chaojie and Zhang, Jingfeng and Pan, Yulin and Liu, Yu},
187
+ journal = {arXiv preprint arXiv:2503.07598},
188
+ year = {2025}
189
+ }
UserGuide.md ADDED
@@ -0,0 +1,160 @@
1
+ # VACE User Guide
2
+
3
+ ## 1. Overall Steps
4
+
5
+ - Preparation: Determine the task type of your creative idea ([single task](#33-single-tasks) or [multi-task composition](#34-composition-task)), and prepare all the required materials (images, videos, prompts, etc.).
+ - Preprocessing: Select the appropriate preprocessing method based on the task name, then preprocess your materials to meet the model's input requirements.
+ - Inference: Perform VACE inference on the preprocessed materials to obtain the results.
8
+
9
+ ## 2. Preparations
10
+
11
+ ### 2.1 Task Definition
12
+
13
+ VACE, as a unified video generation solution, simultaneously supports Video Generation, Video Editing, and complex composition tasks. Specifically:
14
+
15
+ - Video Generation: No video input. Concepts are injected into the model through semantic understanding of text and reference materials, including **T2V** (Text-to-Video Generation) and **R2V** (Reference-to-Video Generation) tasks.
+ - Video Editing: With video input. The input video is modified at the pixel level, globally or locally, including **V2V** (Video-to-Video Editing) and **MV2V** (Masked Video-to-Video Editing).
+ - Composition Task: Compose two or more of the single tasks above into a complex composition task, such as **Reference Anything** (Face R2V + Object R2V), **Move Anything** (Frame R2V + Layout V2V), **Animate Anything** (R2V + Pose V2V), **Swap Anything** (R2V + Inpainting MV2V), and **Expand Anything** (Object R2V + Frame R2V + Outpainting MV2V), etc.
18
+
19
+ Single tasks and compositional tasks are illustrated in the diagram below:
20
+
21
+ ![vace_task](assets/materials/tasks.png)
22
+
23
+
24
+ ### 2.2 Limitations
25
+
26
+ - Very high-resolution videos will be resized to an appropriate spatial size.
+ - Very long videos will be trimmed or uniformly sampled down to around 5 seconds.
+ - For users who need long video generation, we recommend generating 5-second clips one by one, using the `firstclip` video extension task to keep temporal consistency (see the sketch below).
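+ A minimal sketch of this clip-by-clip workflow; `--task clipref --mode firstclip` is inferred from the single-task table below (by analogy with the `--task depth` example), and the file paths are placeholders:
+ ```bash
+ # 1) Generate the first 5s clip with any task, e.g. plain text-to-video.
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --prompt "xxx"
+
+ # 2) Treat the previous output as the first clip and extend it; the preprocessed
+ #    src_video/src_mask are written to ./processed/ by default.
+ python vace/vace_preproccess.py --task clipref --mode firstclip --video <path-to-previous-clip>
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --prompt "xxx"
+
+ # 3) Repeat step 2 with each newly generated clip to keep temporal consistency.
+ ```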
29
+
30
+ ## 3. Preprocessing
31
+ ### 3.1 VACE-Recognizable Inputs
32
+
33
+ User-collected materials need to be preprocessed into VACE-recognizable inputs, including **`src_video`**, **`src_mask`**, **`src_ref_images`**, and **`prompt`**.
+ Specific descriptions are as follows:
35
+
36
+ - `src_video`: The video fed into the model for editing, such as a condition video (Depth, Pose, etc.) or an in/outpainting input video. **Gray areas** (pixel value 127) represent the missing parts of the video. In the first-frame R2V task, the first frame is the reference frame while subsequent frames are left gray. The missing parts of an in/outpainting `src_video` are also set to gray.
+ - `src_mask`: A 3D mask with the same shape as `src_video`. **White areas** mark the parts to be generated, while **black areas** mark the parts to be retained.
+ - `src_ref_images`: Reference images for R2V. Salient object segmentation can be applied to keep the background white.
+ - `prompt`: A text description of the content of the output video. Prompt expansion can be used to achieve better generation results for LTX-Video and for English users of Wan2.1. Use descriptive prompts rather than instructions.
40
+
41
+ Among them, `prompt` is required while `src_video`, `src_mask`, and `src_ref_images` are optional. For instance, MV2V task requires `src_video`, `src_mask`, and `prompt`; R2V task only requires `src_ref_images` and `prompt`.
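+ For example, an MV2V (inpainting) workflow first produces `src_video` and `src_mask` via preprocessing and then passes them, together with the `prompt`, to inference (commands taken from the repository examples; preprocessed files are written to `./processed/` by default):
+ ```bash
+ # Produce src_video / src_mask for an inpainting region given by a bbox
+ python vace/vace_preproccess.py --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4
+
+ # Feed the preprocessed inputs to the model
+ python vace/vace_wan_inference.py --ckpt_dir <path-to-model> --src_video <path-to-src-video> --src_mask <path-to-src-mask> --prompt "xxx"
+ ```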
42
+
43
+ ### 3.2 Preprocessing Tools
44
+ Both command line and Gradio demo are supported.
45
+
46
+ 1) Command Line: You can refer to the `run_vace_preproccess.sh` script and invoke it based on the different task types. An example command is as follows:
47
+ ```bash
48
+ python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
49
+ ```
50
+
51
+ 2) Gradio Interactive: Launch the graphical interface for data preprocessing and perform preprocessing on the interface. The specific command is as follows:
52
+ ```bash
53
+ python vace/gradios/preprocess_demo.py
54
+ ```
55
+
56
+ ![gr_pre_demo](assets/materials/gr_pre_demo.jpg)
57
+
58
+
59
+ ### 3.3 Single Tasks
60
+
61
+ VACE is an all-in-one model supporting various task types, but each task type requires different preprocessing. The specific task types and descriptions are as follows:
62
+
63
+ | Task | Subtask | Annotator | Input modal | Params | Note |
64
+ |------------|----------------------|----------------------------|------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------|
65
+ | txt2vid | txt2vid | / | / | / | |
66
+ | control | depth | DepthVideoAnnotator | video | / | |
67
+ | control | flow | FlowVisAnnotator | video | / | |
68
+ | control | gray | GrayVideoAnnotator | video | / | |
69
+ | control | pose | PoseBodyFaceVideoAnnotator | video | / | |
70
+ | control | scribble | ScribbleVideoAnnotator | video | / | |
71
+ | control | layout_bbox | LayoutBboxAnnotator | two bboxes <br>'x1,y1,x2,y2 x1,y1,x2,y2' | / | Move linearly from the first box to the second box |
72
+ | control | layout_track | LayoutTrackAnnotator | video | mode='masktrack/bboxtrack/label/caption'<br>maskaug_mode(optional)='original/original_expand/hull/hull_expand/bbox/bbox_expand'<br>maskaug_ratio(optional)=0~1.0 | Mode represents different methods of subject tracking. |
73
+ | extension | frameref | FrameRefExpandAnnotator | image | mode='firstframe'<br>expand_num=80 (default) | |
74
+ | extension | frameref | FrameRefExpandAnnotator | image | mode='lastframe'<br>expand_num=80 (default) | |
75
+ | extension | frameref | FrameRefExpandAnnotator | two images<br>a.jpg,b.jpg | mode='firstlastframe'<br>expand_num=80 (default) | Images are separated by commas. |
76
+ | extension | clipref | FrameRefExpandAnnotator | video | mode='firstclip'<br>expand_num=80 (default) | |
77
+ | extension | clipref | FrameRefExpandAnnotator | video | mode='lastclip'<br>expand_num=80 (default) | |
78
+ | extension | clipref | FrameRefExpandAnnotator | two videos<br>a.mp4,b.mp4 | mode='firstlastclip'<br>expand_num=80 (default) | Videos are separated by commas. |
79
+ | repainting | inpainting_mask | InpaintingAnnotator | video | mode='salient' | Use salient as a fixed mask. |
80
+ | repainting | inpainting_mask | InpaintingAnnotator | video + mask | mode='mask' | Use mask as a fixed mask. |
81
+ | repainting | inpainting_bbox | InpaintingAnnotator | video + bbox<br>'x1, y1, x2, y2' | mode='bbox' | Use bbox as a fixed mask. |
82
+ | repainting | inpainting_masktrack | InpaintingAnnotator | video | mode='salientmasktrack' | Use salient mask for dynamic tracking. |
83
+ | repainting | inpainting_masktrack | InpaintingAnnotator | video | mode='salientbboxtrack' | Use salient bbox for dynamic tracking. |
84
+ | repainting | inpainting_masktrack | InpaintingAnnotator | video + mask | mode='masktrack' | Use mask for dynamic tracking. |
85
+ | repainting | inpainting_bboxtrack | InpaintingAnnotator | video + bbox<br>'x1, y1, x2, y2' | mode='bboxtrack' | Use bbox for dynamic tracking. |
86
+ | repainting | inpainting_label | InpaintingAnnotator | video + label | mode='label' | Use label for dynamic tracking. |
87
+ | repainting | inpainting_caption | InpaintingAnnotator | video + caption | mode='caption' | Use caption for dynamic tracking. |
88
+ | repainting | outpainting | OutpaintingVideoAnnotator | video | direction=left/right/up/down<br>expand_ratio=0~1.0 | Combine outpainting directions arbitrarily. |
89
+ | reference | image_reference | SubjectAnnotator | image | mode='salient/mask/bbox/salientmasktrack/salientbboxtrack/masktrack/bboxtrack/label/caption'<br>maskaug_mode(optional)='original/original_expand/hull/hull_expand/bbox/bbox_expand'<br>maskaug_ratio(optional)=0~1.0 | Use different methods to obtain the subject region. |
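+ For example, based on the table above, a subtask is selected via `--task` plus the listed params. The `--task`, `--mode`, and `--video` flags appear in the examples above; the image and outpainting flags below are assumptions that mirror the "Input modal" and "Params" columns:
+ ```bash
+ # Control: extract a pose video
+ python vace/vace_preproccess.py --task pose --video assets/videos/test.mp4
+
+ # Extension: expand a single image into a first-frame reference clip (image flag assumed)
+ python vace/vace_preproccess.py --task frameref --mode firstframe --image assets/images/test.jpg
+
+ # Repainting: outpaint a video (direction/expand_ratio flags assumed)
+ python vace/vace_preproccess.py --task outpainting --direction up --expand_ratio 0.5 --video assets/videos/test.mp4
+ ```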
90
+
91
+ ### 3.4 Composition Task
92
+
93
+ Moreover, VACE supports combining tasks to accomplish more complex objectives. The following examples illustrate how tasks can be combined, but these combinations are not limited to the examples provided:
94
+
95
+ | Task | Subtask | Annotator | Input modal | Params | Note |
96
+ |-------------|--------------------|----------------------------|--------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------|
97
+ | composition | reference_anything | ReferenceAnythingAnnotator | image_list | mode='salientmasktrack/salientbboxtrack/masktrack/bboxtrack/label/caption' | Input no more than three images. |
98
+ | composition | animate_anything | AnimateAnythingAnnotator | image + video | mode='salientmasktrack/salientbboxtrack/masktrack/bboxtrack/label/caption' | Video for conditional redrawing; images for reference generation. |
99
+ | composition | swap_anything | SwapAnythingAnnotator | image + video | mode='masktrack/bboxtrack/label/caption'<br>maskaug_mode(optional)='original/original_expand/hull/hull_expand/bbox/bbox_expand'<br>maskaug_ratio(optional)=0~1.0 | Video for conditional redrawing; images for reference generation.<br>Comma-separated mode: first for video, second for images. |
100
+ | composition | expand_anything | ExpandAnythingAnnotator | image + image_list | mode='masktrack/bboxtrack/label/caption'<br>direction=left/right/up/down<br>expand_ratio=0~1.0<br>expand_num=80 (default) | First image for extension edit; others for reference.<br>Comma-separated mode: first for video, second for images. |
101
+ | composition | move_anything | MoveAnythingAnnotator | image + two bboxes | expand_num=80 (default) | First image for initial frame reference; others represented by linear bbox changes. |
102
+ | composition | more_anything | ... | ... | ... | ... |
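+ For example, a Swap-Anything preprocessing call combines a video with a reference image, using a comma-separated mode whose first part applies to the video and second to the image; the image flag and the exact mode syntax are assumptions based on the table:
+ ```bash
+ python vace/vace_preproccess.py --task swap_anything --mode 'masktrack,salientmasktrack' --video assets/videos/test.mp4 --image assets/images/girl.png
+ ```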
103
+
104
+
105
+ ## 4. Model Inference
106
+
107
+ ### 4.1 Execution Methods
108
+
109
+ Both command line and Gradio demo are supported.
110
+
111
+ 1) Command Line: Refer to the `run_vace_ltx.sh` and `run_vace_wan.sh` scripts and invoke them based on the different task types. The input data needs to be preprocessed to obtain parameters such as `src_video`, `src_mask`, `src_ref_images` and `prompt`. An example command is as follows:
112
+ ```bash
113
+ python vace/vace_wan_inference.py --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt <prompt> # wan
114
+ python vace/vace_ltx_inference.py --src_video <path-to-src-video> --src_mask <path-to-src-mask> --src_ref_images <paths-to-src-ref-images> --prompt <prompt> # ltx
115
+ ```
116
+
117
+ 2) Gradio Interactive: Launch the graphical interface for model inference and perform inference through interactions on the interface. The specific command is as follows:
118
+ ```bash
119
+ python vace/gradios/vace_wan_demo.py # wan
120
+ python vace/gradios/vace_ltx_demo.py # ltx
121
+ ```
122
+
123
+ ![gr_infer_demo](assets/materials/gr_infer_demo.jpg)
124
+
125
+ 3) End-to-End Inference: Refer to the `run_vace_pipeline.sh` script and invoke it based on different task types and input data. This pipeline includes both preprocessing and model inference, thereby requiring only user-provided materials. However, it offers relatively less flexibility. An example command is as follows:
126
+ ```bash
127
+ python vace/vace_pipeline.py --base wan --task depth --video <path-to-video> --prompt <prompt> # wan
128
+ python vace/vace_pipeline.py --base ltx --task depth --video <path-to-video> --prompt <prompt> # ltx
129
+ ```
130
+
131
+ ### 4.2 Inference Examples
132
+
133
+ We provide test examples under different tasks, enabling users to validate according to their needs. These include **task**, **sub-tasks**, **original inputs** (ori_videos and ori_images), **model inputs** (src_video, src_mask, src_ref_images, prompt), and **model outputs**.
134
+
135
+ | task | subtask | src_video | src_mask | src_ref_images | out_video | prompt | ori_video | ori_images |
136
+ |-------------|--------------------|----------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
137
+ | txt2vid | txt2vid | | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/txt2vid/out_video.mp4"></video> | 狂风巨浪的大海,镜头缓缓推进,一艘渺小的帆船在汹涌的波涛中挣扎漂荡。海面上白沫翻滚,帆船时隐时现,仿佛随时可能被巨浪吞噬。天空乌云密布,雷声轰鸣,海鸥在空中盘旋尖叫。帆船上的人们紧紧抓住缆绳,努力保持平衡。画面风格写实,充满紧张和动感。近景特写,强调风浪的冲击力和帆船的摇晃 | | |
138
+ | extension | firstframe | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/src_mask.mp4"></video> | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/out_video.mp4"></video> | 纪实摄影风格,前景是一位中国越野爱好者坐在越野车上,手持车载电台正在进行通联。他五官清晰,表情专注,眼神坚定地望向前方。越野车停在户外,车身略显脏污,显示出经历过的艰难路况。镜头从车外缓缓拉近,最后定格在人物的面部特写上,展现出他的坚定与热情。中景到近景,动态镜头运镜。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/firstframe/ori_image_1.png"> |
139
+ | repainting | inpainting | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/src_mask.mp4"></video> | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/out_video.mp4"></video> | 一只巨大的金色凤凰从繁华的城市上空展翅飞过,羽毛如火焰般璀璨,闪烁着温暖的光辉,翅膀雄伟地展开。凤凰高昂着头,目光炯炯,轻轻扇动翅膀,散发出淡淡的光芒。下方是熙熙攘攘的市中心,人群惊叹,车水马龙,红蓝两色的霓虹灯在夜空下闪烁。镜头俯视城市街道,捕捉这一壮丽的景象,营造出既神秘又辉煌的氛围。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/inpainting/ori_video.mp4"></video> | |
140
+ | repainting | outpainting | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/src_mask.mp4"></video> | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/out_video.mp4"></video> | 赛博朋克风格,无人机俯瞰视角下的现代西安城墙,镜头穿过永宁门时泛起金色涟漪,城墙砖块化作数据流重组为唐代长安城。周围的街道上流动的人群和飞驰的机械交通工具交织在一起,现代与古代的交融,城墙上的灯光闪烁,形成时空隧道的效果。全息投影技术展现历史变迁,粒子重组特效细腻逼真。大远景逐渐过渡到特写,聚焦于城门特效。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/outpainting/ori_video.mp4"></video> | |
141
+ | control | depth | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/depth/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/depth/out_video.mp4"></video> | 一群年轻人在天空之城拍摄集体照。画面中,一对年轻情侣手牵手,轻声细语,相视而笑,周围是飞翔的彩色热气球和闪烁的星星,营造出浪漫的氛围。天空中,暖阳透过飘浮的云朵,洒下斑驳的光影。镜头以近景特写开始,随着情侣间的亲密互动,缓缓拉远。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/depth/ori_video.mp4"></video> | |
142
+ | control | flow | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/flow/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/flow/out_video.mp4"></video> | 纪实摄影风格,一颗鲜红的小番茄缓缓落入盛着牛奶的玻璃杯中,溅起晶莹的水花。画面以慢镜头捕捉这一瞬间,水花在空中绽放,形成美丽的弧线。玻璃杯中的牛奶纯白,番茄的鲜红与之形成鲜明对比。背景简洁,突出主体。近景特写,垂直俯视视角,展现细节之美。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/flow/ori_video.mp4"></video> | |
143
+ | control | gray | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/gray/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/gray/out_video.mp4"></video> | 镜头缓缓向右平移,身穿淡黄色坎肩长裙的长发女孩面对镜头露出灿烂的漏齿微笑。她的长发随风轻扬,眼神明亮而充满活力。背景是秋天红色和黄色的树叶,阳光透过树叶的缝隙洒下斑驳光影,营造出温馨自然的氛围。画面风格清新自然,仿佛夏日午后的一抹清凉。中景人像,强调自然光效和细腻的皮肤质感。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/gray/ori_video.mp4"></video> | |
144
+ | control | pose | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/pose/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/pose/out_video.mp4"></video> | 在一个热带的庆祝派对上,一家人围坐在椰子树下的长桌旁。桌上摆满了异国风味的美食。长辈们愉悦地交谈,年轻人兴奋地举杯碰撞,孩子们在沙滩上欢乐奔跑。背景中是湛蓝的海洋和明亮的阳光,营造出轻松的气氛。镜头以动态中景捕捉每个开心的瞬间,温暖的阳光映照着他们幸福的面庞。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/pose/ori_video.mp4"></video> | |
145
+ | control | scribble | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/scribble/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/scribble/out_video.mp4"></video> | 画面中荧光色彩的无人机从极低空高速掠过超现实主义风格的西安古城墙,尘埃反射着阳光。镜头快速切换至城墙上的砖石特写,阳光温暖地洒落,勾勒出每一块砖块的细腻纹理。整体画质清晰华丽,运镜流畅如水。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/scribble/ori_video.mp4"></video> | |
146
+ | control | layout | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/layout/src_video.mp4"></video> | | | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/layout/out_video.mp4"></video> | 视频展示了一只成鸟在树枝上的巢中喂养它的幼鸟。成鸟在喂食的过程中,幼鸟张开嘴巴等待食物。随后,成鸟飞走,幼鸟继续等待。成鸟再次飞回,带回食物喂养幼鸟。整个视频的拍摄角度固定,聚焦于巢穴和鸟类的互动,背景是模糊的绿色植被,强调了鸟类的自然行为和生态环境。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/layout/ori_video.mp4"></video> | |
147
+ | reference | face | | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/face/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/face/out_video.mp4"></video> | 视频展示了一位长着尖耳朵的老人,他有一头银白色的长发和小胡子,穿着一件色彩斑斓的长袍,内搭金色衬衫,散发出神秘与智慧的气息。背景为一个华丽宫殿的内部,金碧辉煌。灯光明亮,照亮他脸上的神采奕奕。摄像机旋转动态拍摄,捕捉老人轻松挥手的动作。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/face/ori_image_1.png"> |
148
+ | reference | object | | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/object/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/object/out_video.mp4"></video> | 经典游戏角色马里奥在绿松石色水下世界中,四周环绕着珊瑚和各种各样的热带鱼。马里奥兴奋地向上跳起,摆出经典的欢快姿势,身穿鲜明的蓝色潜水服,红色的潜水面���上印有“M”标志,脚上是一双潜水靴。背景中,水泡随波逐流,浮现出一个巨大而友好的海星。摄像机从水底向上快速移动,捕捉他跃出水面的瞬间,灯光明亮而流动。该场景融合了动画与幻想元素,令人惊叹。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/object/ori_image_1.png"> |
149
+ | composition | reference_anything | | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_1.png">,<img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_2.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/out_video.mp4"></video> | 一名打扮成超人的男子自信地站着,面对镜头,肩头有一只充满活力的毛绒黄色鸭子。他留着整齐的短发和浅色胡须,鸭子有橙色的喙和脚,它的翅膀稍微展开,脚分开以保持稳定。他的表情严肃而坚定。他穿着标志性的蓝红超人服装,胸前有黄色“S”标志。斗篷在他身后飘逸。背景有行人。相机位于视线水平,捕捉角色的整个上半身。灯光均匀明亮。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/ori_image_1.png">,<img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/reference_anything/ori_image_2.png"> |
150
+ | composition | swap_anything | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_mask.mp4"></video> | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/out_video.mp4"></video> | 视频展示了一个人在宽阔的草原上骑马。他有淡紫色长发,穿着传统服饰白上衣黑裤子,动画建模画风,看起来像是在进行某种户外活动或者是在进行某种表演。背景是壮观的山脉和多云的天空,给人一种宁静而广阔的感觉。整个视频的拍摄角度是固定的,重点展示了骑手和他的马。 | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/ori_video.mp4"></video> | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/swap_anything/ori_image_1.jpg"> |
151
+ | composition | expand_anything | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_video.mp4"></video> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_mask.mp4"></video> | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_ref_image_1.png"> | <video controls height="200" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/out_video.mp4"></video> | 古典油画风格,背景是一条河边,画面中央一位成熟优雅的女人,穿着长裙坐在椅子上。她双手从怀里取出打开的红色心形墨镜戴上。固定机位。 | | <img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/ori_image_1.jpeg">,<img style="width: auto; height: 200px; object-fit: contain;" src="benchmarks/VACE-Benchmark/assets/examples/expand_anything/ori_image_2.png"> |
152
+
153
+ ## 5. Limitations
154
+
155
+ - VACE-LTX-Video-0.9
156
+ - The prompt significantly impacts video generation quality on LTX-Video. It must be extended following the method described in this [system prompt](https://huggingface.co/spaces/Lightricks/LTX-Video-Playground/blob/main/assets/system_prompt_i2v.txt). We also provide an input parameter for prompt extension (`--use_prompt_extend`).
+ - This model is intended for experimental research validation within the VACE paper and does not guarantee performance in real-world scenarios. However, its inference speed is very fast (a video in 25 seconds with 40 steps on an A100 GPU), making it suitable for preliminary data and creative validation.
158
+ - VACE-Wan2.1-1.3B-Preview
159
+ - This model mainly keeps the original Wan2.1-T2V-1.3B's video quality while supporting various tasks.
160
+ - When you encounter failure cases with specific tasks, we recommend trying again with a different seed and adjusting the prompt.
__init__.py ADDED
File without changes
assets/images/girl.png ADDED

Git LFS Details

  • SHA256: f461a83c0772dbe93a05ae6b8ce9fa77f0e7f5facb4402685b5410c0dc18397f
  • Pointer size: 131 Bytes
  • Size of remote file: 836 kB
assets/images/snake.png ADDED

Git LFS Details

  • SHA256: 60ae5e275f64de6ca99c5e63eaea6812fe09a6d7e7a233e483e700122ad08124
  • Pointer size: 131 Bytes
  • Size of remote file: 446 kB
assets/images/test.jpg ADDED

Git LFS Details

  • SHA256: 71549d76843c4ee220f37f45e87f0dfc22079d1bc5fbe3f52fe2ded2b9454a3b
  • Pointer size: 131 Bytes
  • Size of remote file: 143 kB
assets/images/test2.jpg ADDED
assets/images/test3.jpg ADDED

Git LFS Details

  • SHA256: bee71955dac07594b21937c2354ab5b7bd3f3321447202476178dab5ceead497
  • Pointer size: 131 Bytes
  • Size of remote file: 214 kB
assets/masks/test.png ADDED
assets/masks/test2.png ADDED
assets/materials/gr_infer_demo.jpg ADDED

Git LFS Details

  • SHA256: 9b4f0df3c602da88e707262029d78284b3b5857e2bac413edef6f117e3ddb8be
  • Pointer size: 131 Bytes
  • Size of remote file: 320 kB
assets/materials/gr_pre_demo.jpg ADDED

Git LFS Details

  • SHA256: 6939180a97bd5abfc8d90bef6b31e949c591e2d75f5719e0eac150871d4aaae2
  • Pointer size: 131 Bytes
  • Size of remote file: 267 kB
assets/materials/tasks.png ADDED

Git LFS Details

  • SHA256: 1f1c4b3f3e6ae927880fbe2f9a46939cc98824bb56c2753c975a2e3c4820830b
  • Pointer size: 131 Bytes
  • Size of remote file: 709 kB
assets/materials/teaser.jpg ADDED

Git LFS Details

  • SHA256: 87ce75e8dcbf1536674d3a951326727e0aff80192f52cf7388b34c03f13f711f
  • Pointer size: 131 Bytes
  • Size of remote file: 892 kB
assets/videos/test.mp4 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2195efbd92773f1ee262154577c700e9c3b7a4d7d04b1a2ac421db0879c696b0
+ size 737090
assets/videos/test2.mp4 ADDED
Binary file (79.6 kB).
 
pyproject.toml ADDED
@@ -0,0 +1,75 @@
+ [build-system]
+ requires = ["setuptools>=42", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "vace"
+ version = "1.1.0"
+ description = "VACE: All-in-One Video Creation and Editing"
+ authors = [
+     { name = "VACE Team", email = "wan.ai@alibabacloud.com" }
+ ]
+ requires-python = ">=3.10,<4.0"
+ readme = "README.md"
+ dependencies = [
+     "torch>=2.5.1",
+     "torchvision>=0.20.1",
+     "opencv-python>=4.9.0.80",
+     "diffusers>=0.31.0",
+     "transformers>=4.49.0",
+     "tokenizers>=0.20.3",
+     "accelerate>=1.1.1",
+     "gradio>=5.0.0",
+     "numpy>=1.23.5,<2",
+     "tqdm",
+     "imageio",
+     "easydict",
+     "ftfy",
+     "dashscope",
+     "imageio-ffmpeg",
+     "flash_attn",
+     "decord",
+     "einops",
+     "scikit-image",
+     "scikit-learn",
+     "pycocotools",
+     "timm",
+     "onnxruntime-gpu",
+     "BeautifulSoup4"
+ ]
+
+ [project.optional-dependencies]
+ ltx = [
+     "ltx-video@git+https://github.com/Lightricks/LTX-Video@ltx-video-0.9.1"
+ ]
+ wan = [
+     "wan@git+https://github.com/Wan-Video/Wan2.1"
+ ]
+ annotator = [
+     "insightface",
+     "sam-2@git+https://github.com/facebookresearch/sam2.git",
+     "segment-anything@git+https://github.com/facebookresearch/segment-anything.git",
+     "groundingdino@git+https://github.com/IDEA-Research/GroundingDINO.git",
+     "ram@git+https://github.com/xinyu1205/recognize-anything.git",
+     "raft@git+https://github.com/martin-chobanyan-sdc/RAFT.git"
+ ]
+
+ [project.urls]
+ homepage = "https://ali-vilab.github.io/VACE-Page/"
+ documentation = "https://ali-vilab.github.io/VACE-Page/"
+ repository = "https://github.com/ali-vilab/VACE"
+ hfmodel = "https://huggingface.co/collections/ali-vilab/vace-67eca186ff3e3564726aff38"
+ msmodel = "https://modelscope.cn/collections/VACE-8fa5fcfd386e43"
+ paper = "https://arxiv.org/abs/2503.07598"
+
+ [tool.setuptools]
+ packages = { find = {} }
+
+ [tool.black]
+ line-length = 88
+
+ [tool.isort]
+ profile = "black"
+
+ [tool.mypy]
+ strict = true
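+
+ # The optional dependency groups above can be installed as pip extras, e.g. with an
+ # editable install from a local clone (a minimal sketch; combine extras as needed):
+ #   pip install -e ".[wan]"        # Wan2.1-based VACE
+ #   pip install -e ".[ltx]"        # LTX-Video-based VACE
+ #   pip install -e ".[annotator]"  # preprocessing / annotator tools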
requirements.txt ADDED
@@ -0,0 +1 @@
+ -r requirements/framework.txt
requirements/annotator.txt ADDED
@@ -0,0 +1,6 @@
+ insightface
+ git+https://github.com/facebookresearch/sam2.git
+ git+https://github.com/facebookresearch/segment-anything.git
+ git+https://github.com/IDEA-Research/GroundingDINO.git
+ git+https://github.com/xinyu1205/recognize-anything.git
+ git+https://github.com/martin-chobanyan-sdc/RAFT.git
requirements/framework.txt ADDED
@@ -0,0 +1,26 @@
+ torch>=2.5.1
+ torchvision>=0.20.1
+ opencv-python>=4.9.0.80
+ diffusers>=0.31.0
+ transformers>=4.49.0
+ tokenizers>=0.20.3
+ accelerate>=1.1.1
+ gradio>=5.0.0
+ numpy>=1.23.5,<2
+ tqdm
+ imageio
+ easydict
+ ftfy
+ dashscope
+ imageio-ffmpeg
+ flash_attn
+ decord
+ einops
+ scikit-image
+ scikit-learn
+ pycocotools
+ timm
+ onnxruntime-gpu
+ BeautifulSoup4
+ #ltx-video@git+https://github.com/Lightricks/LTX-Video@ltx-video-0.9.1
+ #wan@git+https://github.com/Wan-Video/Wan2.1
run_vace_ltx.sh ADDED
@@ -0,0 +1,48 @@
1
+ #------------------------ Gradio ------------------------#
2
+ python vace/gradios/vace_ltx_demo.py
3
+
4
+ #------------------------ CLI ------------------------#
5
+ # txt2vid txt2vid
6
+ python vace/vace_ltx_inference.py --prompt "A sailboat with a white sail is navigating through rough, dark blue ocean waters under a stormy sky filled with thick, gray clouds. The boat tilts significantly as it rides the waves, and several seagulls fly around it. The scene is captured in real-life footage, with the camera angle shifting to follow the movement of the boat, emphasizing its struggle against the turbulent sea. The lighting is dim, reflecting the overcast conditions, and the overall tone is dramatic and intense."
7
+
8
+ # extension firstframe
9
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_mask.mp4" --prompt "A man in a black long-sleeve shirt is sitting inside a white vehicle, holding a walkie-talkie. He looks out the window with a serious expression. The camera gradually zooms in on his face, emphasizing his focused gaze. The background is blurred, but it appears to be an outdoor setting with some structures visible. The lighting is natural and bright, suggesting daytime. The scene is captured in real-life footage."
10
+
11
+ # repainting inpainting
12
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_mask.mp4" --prompt "A huge golden phoenix spread its wings and flew over the bustling city, its feathers shining brightly like flames, shimmering with warm radiance, and its wings spreading out majestically. The city below is filled with tall buildings adorned with colorful lights and billboards, creating a vibrant urban landscape. The camera follows the phoenix's flight from a high angle, capturing the grandeur of both the creature and the cityscape. The lighting is predominantly artificial, casting a warm glow on the buildings and streets, contrasting with the dark sky. The scene is a blend of animation and real-life footage, seamlessly integrating the fantastical element of the phoenix into a realistic city environment."
13
+
14
+ # repainting outpainting
15
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_mask.mp4" --prompt "The video begins with an aerial view of a grand, ancient gate illuminated by warm lights against the evening sky. The gate is surrounded by lush greenery and traditional Chinese architecture, including a prominent red-roofed building in the background. As the scene progresses, the gate's lighting intensifies, and a dynamic light show starts, featuring bright yellow and blue streaks emanating from the gate's archway, creating a visually striking effect. The light show continues to build in intensity, with more vibrant colors and patterns emerging. The camera angle remains static, capturing the entire spectacle from above. The lighting transitions from the natural dusk hues to the vivid, artificial lights of the display, enhancing the dramatic atmosphere. The scene is captured in real-life footage, showcasing the blend of historical architecture and modern light technology."
16
+
17
+ # control depth
18
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/depth/src_video.mp4" --prompt "In this enchanting animated scene, a group of young people gathers in a whimsical sky city to take a group photo, yet the photographer consistently captures the tender moments shared between couples. In the foreground, a young couple holds hands, while gazing into each other's eyes, smiles lighting up their faces. Surrounding them, vibrant hot air balloons float gracefully, and twinkling stars add a touch of magic to the atmosphere. The background features a dreamy sky, where warm sunlight filters through fluffy clouds, creating dappled shadows on the scene. The camera begins with a close-up, focusing on the couple's affectionate gestures, then slowly zooms out to reveal the warmth and vibrancy of the entire setting. The lighting is soft and romantic, casting a golden hue. The scene is captured in real-life footage"
19
+
20
+ # control flow
21
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/flow/src_video.mp4" --prompt "A bright red tomato was placed in a glass of milk, splashing water and creating ripples. The tomato sinks to the bottom of the glass, and the milk keeps shaking. The camera angle is a close-up shot, focusing on glass and milk. The bright and natural lighting highlights the pure white of the milk and the bright red of the tomatoes. This scene seems to be a real shot."
22
+
23
+ # control gray
24
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/gray/src_video.mp4" --prompt "A young woman with long, straight purple hair is standing in front of a lush autumn background. She is wearing an off-shoulder light yellow dress and smiling at the camera. The wind gently blows her hair to one side. The lighting is natural and bright, highlighting her features and the vibrant red and yellow leaves behind her. The scene is captured in real-life footage with a steady camera angle focusing on the woman's upper body."
25
+
26
+ # control pose
27
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/pose/src_video.mp4" --prompt "In a tropical celebration, a family gathers around a long table nestled under swaying palm trees, basking in the warmth of the sun. The table is laden with an array of exotic dishes, each colorful plate invitingly displayed. Elders engage in joyful conversations, their faces animated, while young adults raise their glasses in enthusiastic toasts. Children dash across the sandy beach. The background features a stunning azure ocean under a bright sun. The camera angle is in a dynamic mid-shot, fluidly capturing the moments of laughter and connection, while the lighting is bright and golden. The scene is presented in a realistic style."
28
+
29
+ # control scribble
30
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/scribble/src_video.mp4" --prompt "In this visually stunning scene, a vivid, neon-colored drone zips past the surreal West Xi'an ancient city wall at a low altitude, kicking up a cloud of glittering dust that catches the sunlight in a spectrum of colors. The camera swiftly shifts to a close-up of the bricks on the wall, where warm sunlight illuminates each stone, revealing intricate textures that tell tales of history. The background is rich with the majestic, timeworn wall, blending seamlessly into a dreamy atmosphere. The camera angle is at a dynamic angle, following the drone's swift movements with smooth transitions. The lighting is bright and vibrant, casting a magical glow. This scene is realized in striking animation."
31
+
32
+ # control layout
33
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/layout/src_video.mp4" --prompt "A small bird with a grey head, white chest, and orange tail feathers lands on a nest in a tree. The nest is made of twigs and leaves and contains three baby birds with their mouths open, waiting to be fed. The adult bird feeds the baby birds one by one, then takes off from the nest. The background is a blurred green forest, providing a natural setting for the scene. The camera angle is steady, focusing on the nest and the birds, capturing the intimate moment of feeding. The lighting is bright and natural, highlighting the colors of the birds and the nest. The scene appears to be real-life footage."
34
+
35
+ # reference face
36
+ python vace/vace_ltx_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/face/src_ref_image_1.png" --prompt "The video unfolds with an elderly man sporting pointy ears, his long silver hair cascading down, and a neatly trimmed goatee, wearing a vibrant, colorful robe over a golden shirt that radiates an aura of mystery and wisdom. The background is the interior of a magnificent palace, shining brilliantly. The camera dynamically rotates to capture this enchanting moment from various angles. The lighting is bright casting a warm glow. This scene seems to be a real shot."
37
+
38
+ # reference object
39
+ python vace/vace_ltx_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/object/src_ref_image_1.png" --prompt "Classic game character Mario is submerged in a turquoise underwater world, surrounded by vibrant corals and various tropical fish. He jumps excitedly upwards, striking his iconic cheerful pose while wearing a bright blue wetsuit and a red diving mask adorned with an “M” logo. His feet are equipped with sturdy diving boots. In the background, bubbles drift with the currents, revealing a large and friendly starfish nearby. The camera moves swiftly from the seabed upwards, capturing the moment he breaks the surface of the water. The lighting is bright and flowing. The scene combines animated and fantastical elements, creating a visually stunning experience."
40
+
41
+ # composition reference_anything
42
+ python vace/vace_ltx_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_1.png,benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_2.png" --prompt "A man dressed as Superman stands confidently facing the camera, with a lively plush yellow duck perched on his shoulder. The man has neatly trimmed short hair and light stubble, while the duck features an orange beak and feet with slightly spread wings and legs positioned to maintain balance. The man's expression is serious and determined. He wears the iconic blue and red Superman costume, complete with a yellow \"S\" emblem on his chest and a cape flowing behind him. The background includes pedestrians walking by, adding to the scene's atmosphere. The camera is positioned at eye level, capturing the man's entire upper body. The lighting is bright and even, illuminating both the man and the duck. The scene appears to be real-life footage."
43
+
44
+ # composition swap_anything
45
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_ref_image_1.png" --prompt "The video depicts a person with long, pale purple hair riding a horse across a vast grassland. The individual wears traditional attire featuring a white top and black pants, styled in an animation modeling approach, suggesting engagement in some outdoor activity or performance. The backdrop showcases magnificent mountains under a sky dotted with clouds, imparting a serene and expansive atmosphere. The camera angle is fixed throughout the video, focusing on the rider and his horse as they move through the landscape. The lighting is natural, highlighting the serene majesty of the scene. The scene is animated, capturing the tranquil beauty of the vast plains and towering mountains."
46
+
47
+ # composition expand_anything
48
+ python vace/vace_ltx_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_ref_image_1.png" --prompt "Set in the style of a classical oil painting, the scene unfolds along the bank of a river. At the center of the frame is a mature and elegant woman seated on a chair, wearing a flowing long dress. She gracefully lifts her hands from her lap to don a pair of red, heart-shaped sunglasses. The background features the tranquil river and lush surroundings, enhancing the serene atmosphere. The camera remains fixed, emphasizing the woman and her refined actions. The lighting is soft and warm, imitating the natural illumination typical of oil paintings. The scene is animated to replicate the timeless beauty and charm of classical art."
run_vace_pipeline.sh ADDED
@@ -0,0 +1,27 @@
1
+ #------------------------ Pipeline ------------------------#
2
+ # extension firstframe
3
+ python vace/vace_pipeline.py --base wan --task frameref --mode firstframe --image "benchmarks/VACE-Benchmark/assets/examples/firstframe/ori_image_1.png" --prompt "纪实摄影风格,前景是一位中国越野爱好者坐在越野车上,手持车载电台正在进行通联。他五官清晰,表情专注,眼神坚定地望向前方。越野车停在户外,车身略显脏污,显示出经历过的艰难路况。镜头从车外缓缓拉近,最后定格在人物的面部特写上,展现出他的坚定与热情。中景到近景,动态镜头运镜。"
4
+
5
+ # repainting inpainting
6
+ python vace/vace_pipeline.py --base wan --task inpainting --mode salientmasktrack --maskaug_mode original_expand --maskaug_ratio 0.5 --video "benchmarks/VACE-Benchmark/assets/examples/inpainting/ori_video.mp4" --prompt "一只巨大的金色凤凰从繁华的城市上空展翅飞过,羽毛如火焰般璀璨,闪烁着温暖的光辉,翅膀雄伟地展开。凤凰高昂着头,目光炯炯,轻轻扇动翅膀,散发出淡淡的光芒。下方是熙熙攘攘的市中心,人群惊叹,车水马龙,红蓝两色的霓虹灯在夜空下闪烁。镜头俯视城市街道,捕捉这一壮丽的景象,营造出既神秘又辉煌的氛围。"
7
+
8
+ # repainting outpainting
9
+ python vace/vace_pipeline.py --base wan --task outpainting --direction 'up,down,left,right' --expand_ratio 0.3 --video "benchmarks/VACE-Benchmark/assets/examples/outpainting/ori_video.mp4" --prompt "赛博朋克风格,无人机俯瞰视角下的现代西安城墙,镜头穿过永宁门时泛起金色涟漪,城墙砖块化作数据流重组为唐代长安城。周围的街道上流动的人群和飞驰的机械交通工具交织在一起,现代与古代的交融,城墙上的灯光闪烁,形成时空隧道的效果。全息投影技术展现历史变迁,粒子重组特效细腻逼真。大远景逐渐过渡到特写,聚焦于城门特效。"
10
+
11
+ # control depth
12
+ python vace/vace_pipeline.py --base wan --task depth --video "benchmarks/VACE-Benchmark/assets/examples/depth/ori_video.mp4" --prompt "一群年轻人在天空之城拍摄集体照。画面中,一对年轻情侣手牵手,轻声细语,相视而笑,周围是飞翔的彩色热气球和闪烁的星星,营造出浪漫的氛围。天空中,暖阳透过飘浮的云朵,洒下斑驳的光影。镜头以近景特写开始,随着情侣间的亲密互动,缓缓拉远。"
13
+
14
+ # control flow
15
+ python vace/vace_pipeline.py --base wan --task flow --video "benchmarks/VACE-Benchmark/assets/examples/flow/ori_video.mp4" --prompt "纪实摄影风格,一颗鲜红的小番茄缓缓落入盛着牛奶的玻璃杯中,溅起晶莹的水花。画面以慢镜头捕捉这一瞬间,水花在空中绽放,形成美丽的弧线。玻璃杯中的牛奶纯白,番茄的鲜红与之形成鲜明对比。背景简洁,突出主体。近景特写,垂直俯视视角,展现细节之美。"
16
+
17
+ # control gray
18
+ python vace/vace_pipeline.py --base wan --task gray --video "benchmarks/VACE-Benchmark/assets/examples/gray/ori_video.mp4" --prompt "镜头缓缓向右平移,身穿淡黄色坎肩长裙的长发女孩面对镜头露出灿烂的漏齿微笑。她的长发随风轻扬,眼神明亮而充满活力。背景是秋天红色和黄色的树叶,阳光透过树叶的缝隙洒下斑驳光影,营造出温馨自然的氛围。画面风格清新自然,仿佛夏日午后的一抹清凉。中景人像,强调自然光效和细腻的皮肤质感。"
19
+
20
+ # control pose
21
+ python vace/vace_pipeline.py --base wan --task pose --video "benchmarks/VACE-Benchmark/assets/examples/pose/ori_video.mp4" --prompt "在一个热带的庆祝派对上,一家人围坐在椰子树下的长桌旁。桌上摆满了异国风味的美食。长辈们愉悦地交谈,年轻人兴奋地举杯碰撞,孩子们在沙滩上欢乐奔跑。背景中是湛蓝的海洋和明亮的阳光,营造出轻松的气氛。镜头以动态中景捕捉每个开心的瞬间,温暖的阳光映照着他们幸福的面庞。"
22
+
23
+ # control scribble
24
+ python vace/vace_pipeline.py --base wan --task scribble --video "benchmarks/VACE-Benchmark/assets/examples/scribble/ori_video.mp4" --prompt "画面中荧光色彩的无人机从极低空高速掠过超现实主义风格的西安古城墙,尘埃反射着阳光。镜头快速切换至城墙上的砖石特写,阳光温暖地洒落,勾勒出每一块砖块的细腻纹理。整体画质清晰华丽,运镜流畅如水。"
25
+
26
+ # control layout
27
+ python vace/vace_pipeline.py --base wan --task layout_track --mode bboxtrack --bbox '54,200,614,448' --maskaug_mode bbox_expand --maskaug_ratio 0.2 --label 'bird' --video "benchmarks/VACE-Benchmark/assets/examples/layout/ori_video.mp4" --prompt "视频展示了一只成鸟在树枝上的巢中喂养它的幼鸟。成鸟在喂食的过程中,幼鸟张开嘴巴等待食物。随后,成鸟飞走,幼鸟继续等待。成鸟再次飞回,带回食物喂养幼鸟。整个视频的拍摄角度固定,聚焦于巢穴和鸟类的互动,背景是模糊的绿色植被,强调了鸟类的自然行为和生态环境。"
run_vace_preproccess.sh ADDED
@@ -0,0 +1,58 @@
1
+ #------------------------ Gradio ------------------------#
2
+ python vace/gradios/vace_preproccess_demo.py
3
+
4
+ #------------------------ Video ------------------------#
5
+ python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
6
+ python vace/vace_preproccess.py --task flow --video assets/videos/test.mp4
7
+ python vace/vace_preproccess.py --task gray --video assets/videos/test.mp4
8
+ python vace/vace_preproccess.py --task pose --video assets/videos/test.mp4
9
+ python vace/vace_preproccess.py --task scribble --video assets/videos/test.mp4
10
+ python vace/vace_preproccess.py --task frameref --mode firstframe --image assets/images/test.jpg
11
+ python vace/vace_preproccess.py --task frameref --mode lastframe --expand_num 55 --image assets/images/test.jpg
12
+ python vace/vace_preproccess.py --task frameref --mode firstlastframe --image assets/images/test.jpg,assets/images/test2.jpg
13
+ python vace/vace_preproccess.py --task clipref --mode firstclip --expand_num 66 --video assets/videos/test.mp4
14
+ python vace/vace_preproccess.py --task clipref --mode lastclip --expand_num 55 --video assets/videos/test.mp4
15
+ python vace/vace_preproccess.py --task clipref --mode firstlastclip --video assets/videos/test.mp4,assets/videos/test2.mp4
16
+ python vace/vace_preproccess.py --task inpainting --mode salient --video assets/videos/test.mp4
17
+ python vace/vace_preproccess.py --task inpainting --mode mask --mask assets/masks/test.png --video assets/videos/test.mp4
18
+ python vace/vace_preproccess.py --task inpainting --mode bbox --bbox 50,50,550,700 --video assets/videos/test.mp4
19
+ python vace/vace_preproccess.py --task inpainting --mode salientmasktrack --video assets/videos/test.mp4
20
+ python vace/vace_preproccess.py --task inpainting --mode salientbboxtrack --video assets/videos/test.mp4
21
+ python vace/vace_preproccess.py --task inpainting --mode masktrack --mask assets/masks/test.png --video assets/videos/test.mp4
22
+ python vace/vace_preproccess.py --task inpainting --mode bboxtrack --bbox 50,50,550,700 --video assets/videos/test.mp4
23
+ python vace/vace_preproccess.py --task inpainting --mode label --label cat --video assets/videos/test.mp4
24
+ python vace/vace_preproccess.py --task inpainting --mode caption --caption 'boxing glove' --video assets/videos/test.mp4
25
+ python vace/vace_preproccess.py --task outpainting --video assets/videos/test.mp4
26
+ python vace/vace_preproccess.py --task outpainting --direction 'up,down,left,right' --expand_ratio 0.5 --video assets/videos/test.mp4
27
+ python vace/vace_preproccess.py --task layout_bbox --bbox '50,50,550,700 500,150,750,700' --label 'person'
28
+ python vace/vace_preproccess.py --task layout_track --mode masktrack --mask assets/masks/test.png --label 'cat' --video assets/videos/test.mp4
29
+ python vace/vace_preproccess.py --task layout_track --mode bboxtrack --bbox '50,50,550,700' --label 'cat' --video assets/videos/test.mp4
30
+ python vace/vace_preproccess.py --task layout_track --mode label --label 'cat' --maskaug_mode hull_expand --maskaug_ratio 0.1 --video assets/videos/test.mp4
31
+ python vace/vace_preproccess.py --task layout_track --mode caption --caption 'boxing glove' --maskaug_mode bbox --video assets/videos/test.mp4 --label 'glove'
32
+
33
+ #------------------------ Image ------------------------#
34
+ python vace/vace_preproccess.py --task image_face --image assets/images/test3.jpg
35
+ python vace/vace_preproccess.py --task image_salient --image assets/images/test.jpg
36
+ python vace/vace_preproccess.py --task image_inpainting --mode 'salientbboxtrack' --image assets/images/test2.jpg
37
+ python vace/vace_preproccess.py --task image_inpainting --mode 'salientmasktrack' --maskaug_mode hull_expand --maskaug_ratio 0.3 --image assets/images/test2.jpg
38
+ python vace/vace_preproccess.py --task image_reference --mode plain --image assets/images/test.jpg
39
+ python vace/vace_preproccess.py --task image_reference --mode salient --image assets/images/test.jpg
40
+ python vace/vace_preproccess.py --task image_reference --mode mask --mask assets/masks/test2.png --image assets/images/test.jpg
41
+ python vace/vace_preproccess.py --task image_reference --mode bbox --bbox 0,264,338,636 --image assets/images/test.jpg
42
+ python vace/vace_preproccess.py --task image_reference --mode salientmasktrack --image assets/images/test.jpg # easy way, recommended
43
+ python vace/vace_preproccess.py --task image_reference --mode salientbboxtrack --bbox 0,264,338,636 --maskaug_mode original_expand --maskaug_ratio 0.2 --image assets/images/test.jpg
44
+ python vace/vace_preproccess.py --task image_reference --mode masktrack --mask assets/masks/test2.png --image assets/images/test.jpg
45
+ python vace/vace_preproccess.py --task image_reference --mode bboxtrack --bbox 0,264,338,636 --image assets/images/test.jpg
46
+ python vace/vace_preproccess.py --task image_reference --mode label --label 'cat' --image assets/images/test.jpg
47
+ python vace/vace_preproccess.py --task image_reference --mode caption --caption 'flower' --maskaug_mode bbox --maskaug_ratio 0.3 --image assets/images/test.jpg
48
+
49
+ #------------------------ Composition ------------------------#
50
+ python vace/vace_preproccess.py --task reference_anything --mode salientmasktrack --image assets/images/test.jpg
51
+ python vace/vace_preproccess.py --task reference_anything --mode salientbboxtrack --image assets/images/test.jpg,assets/images/test2.jpg
52
+ python vace/vace_preproccess.py --task animate_anything --mode salientbboxtrack --video assets/videos/test.mp4 --image assets/images/test.jpg
53
+ python vace/vace_preproccess.py --task swap_anything --mode salientmasktrack --video assets/videos/test.mp4 --image assets/images/test.jpg
54
+ python vace/vace_preproccess.py --task swap_anything --mode label,salientbboxtrack --label 'cat' --maskaug_mode bbox --maskaug_ratio 0.3 --video assets/videos/test.mp4 --image assets/images/test.jpg
55
+ python vace/vace_preproccess.py --task swap_anything --mode label,plain --label 'cat' --maskaug_mode bbox --maskaug_ratio 0.3 --video assets/videos/test.mp4 --image assets/images/test.jpg
56
+ python vace/vace_preproccess.py --task expand_anything --mode salientbboxtrack --direction 'left,right' --expand_ratio 0.5 --expand_num 80 --image assets/images/test.jpg,assets/images/test2.jpg
57
+ python vace/vace_preproccess.py --task expand_anything --mode firstframe,plain --direction 'left,right' --expand_ratio 0.5 --expand_num 80 --image assets/images/test.jpg,assets/images/test2.jpg
58
+ python vace/vace_preproccess.py --task move_anything --bbox '0,264,338,636 400,264,538,636' --expand_num 80 --label 'cat' --image assets/images/test.jpg
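
The lines above are plain CLI invocations of vace/vace_preproccess.py. A minimal sketch of driving one of them from Python instead of the shell (the task and flags are copied verbatim from the depth line above; wrapping the call in subprocess is an illustration, not part of the repo):

import subprocess

# Mirrors: python vace/vace_preproccess.py --task depth --video assets/videos/test.mp4
subprocess.run(
    ["python", "vace/vace_preproccess.py", "--task", "depth", "--video", "assets/videos/test.mp4"],
    check=True,  # raise if preprocessing fails
)
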
run_vace_wan.sh ADDED
@@ -0,0 +1,48 @@
1
+ #------------------------ Gradio ------------------------#
2
+ python vace/gradios/vace_wan_demo.py
3
+
4
+ #------------------------ CLI ------------------------#
5
+ # txt2vid txt2vid
6
+ python vace/vace_wan_inference.py --prompt "狂风巨浪的大海,镜头缓缓推进,一艘渺小的帆船在汹涌的波涛中挣扎漂荡。海面上白沫翻滚,帆船时隐时现,仿佛随时可能被巨浪吞噬。天空乌云密布,雷声轰鸣,海鸥在空中盘旋尖叫。帆船上的人们紧紧抓住缆绳,努力保持平衡。画面风格写实,充满紧张和动感。近景特写,强调风浪的冲击力和帆船的摇晃"
7
+
8
+ # extension firstframe
9
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/firstframe/src_mask.mp4" --prompt "纪实摄影风格,前景是一位中国越野爱好者坐在越野车上,手持车载电台正在进行通联。他五官清晰,表情专注,眼神坚定地望向前方。越野车停在户外,车身略显脏污,显示出经历过的艰难路况。镜头从车外缓缓拉近,最后定格在人物的面部特写上,展现出他的坚定与热情。中景到近景,动态镜头运镜。"
10
+
11
+ # repainting inpainting
12
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/inpainting/src_mask.mp4" --prompt "一只巨大的金色凤凰从繁华的城市上空展翅飞过,羽毛如火焰般璀璨,闪烁着温暖的光辉,翅膀雄伟地展开。凤凰高昂着头,目光炯炯,轻轻扇动翅膀,散发出淡淡的光芒。下方是熙熙攘攘的市中心,人群惊叹,车水马龙,红蓝两色的霓虹灯在夜空下闪烁。镜头俯视城市街道,捕捉这一壮丽的景象,营造出既神秘又辉煌的氛围。"
13
+
14
+ # repainting outpainting
15
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/outpainting/src_mask.mp4" --prompt "赛博朋克风格,无人机俯瞰视角下的现代西安城墙,镜头穿过永宁门时泛起金色涟漪,城墙砖块化作数据流重组为唐代长安城。周围的街道上流动的人群和飞驰的机械交通工具交织在一起,现代与古代的交融,城墙上的灯光闪烁,形成时空隧道的效果。全息投影技术展现历史变迁,粒子重组特效细腻逼真。大远景逐渐过渡到特写,聚焦于城门特效。"
16
+
17
+ # control depth
18
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/depth/src_video.mp4" --prompt "一群年轻人在天空之城拍摄集体照。画面中,一对年轻情侣手牵手,轻声细语,相视而笑,周围是飞翔的彩色热气球和闪烁的星星,营造出浪漫的氛围。天空中,暖阳透过飘浮的云朵,洒下斑驳的光影。镜头以近景特写开始,随着情侣间的亲密互动,缓缓拉远。"
19
+
20
+ # control flow
21
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/flow/src_video.mp4" --prompt "纪实摄影风格,一颗鲜红的小番茄缓缓落入盛着牛奶的玻璃杯中,溅起晶莹的水花。画面以慢镜头捕捉这一瞬间,水花在空中绽放,形成美丽的弧线。玻璃杯中的牛奶纯白,番茄的鲜红与之形成鲜明对比。背景简洁,突出主体。近景特写,垂直俯视视角,展现细节之美。"
22
+
23
+ # control gray
24
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/gray/src_video.mp4" --prompt "镜头缓缓向右平移,身穿淡黄色坎肩长裙的长发女孩面对镜头露出灿烂的漏齿微笑。她的长发随风轻扬,眼神明亮而充满活力。背景是秋天红色和黄色的树叶,阳光透过树叶的缝隙洒下斑驳光影,营造出温馨自然的氛围。画面风格清新自然,仿佛夏日午后的一抹清凉。中景人像,强调自然光效和细腻的皮肤质感。"
25
+
26
+ # control pose
27
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/pose/src_video.mp4" --prompt "在一个热带的庆祝派对上,一家人围坐在椰子树下的长桌旁。桌上摆满了异国风味的美食。长辈们愉悦地交谈,年轻人兴奋地举杯碰撞,孩子们在沙滩上欢乐奔跑。背景中是湛蓝的海洋和明亮的阳光,营造出轻松的气氛。镜头以动态中景捕捉每个开心的瞬间,温暖的阳光映照着他们幸福的面庞。"
28
+
29
+ # control scribble
30
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/scribble/src_video.mp4" --prompt "画面中荧光色彩的无人机从极低空高速掠过超现实主义风格的西安古城墙,尘埃反射着阳光。镜头快速切换至城墙上的砖石特写,阳光温暖地洒落,勾勒出每一块砖块的细腻纹理。整体画质清晰华丽,运镜流畅如水。"
31
+
32
+ # control layout
33
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/layout/src_video.mp4" --prompt "视频展示了一只成鸟在树枝上的巢中喂养它的幼鸟。成鸟在喂食的过程中,幼鸟张开嘴巴等待食物。随后,成鸟飞走,幼鸟继续等待。成鸟再次飞回,带回食物喂养幼鸟。整个视频的拍摄角度固定,聚焦于巢穴和鸟类的互动,背景是模糊的绿色植被,强调了鸟类的自然行为和生态环境。"
34
+
35
+ # reference face
36
+ python vace/vace_wan_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/face/src_ref_image_1.png" --prompt "视频展示了一位长着尖耳朵的老人,他有一头银白色的长发和小胡子,穿着一件色彩斑斓的长袍,内搭金色衬衫,散发出神秘与智慧的气息。背景为一个华丽宫殿的内部,金碧辉煌。灯光明亮,照亮他脸上的神采奕奕。摄像机旋转动态拍摄,捕捉老人轻松挥手的动作。"
37
+
38
+ # reference object
39
+ python vace/vace_wan_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/object/src_ref_image_1.png" --prompt "经典游戏角色马里奥在绿松石色水下世界中,四周环绕着珊瑚和各种各样的热带鱼。马里奥兴奋地向上跳起,摆出经典的欢快姿势,身穿鲜明的蓝色潜水服,红色的潜水面罩上印有“M”标志,脚上是一双潜水靴。背景中,水泡随波逐流,浮现出一个巨大而友好的海星。摄像机从水底向上快速移动,捕捉他跃出水面的瞬间,灯光明亮而流动。该场景融合了动画与幻想元素,令人惊叹。"
40
+
41
+ # composition reference_anything
42
+ python vace/vace_wan_inference.py --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_1.png,benchmarks/VACE-Benchmark/assets/examples/reference_anything/src_ref_image_2.png" --prompt "一名打扮成超人的男子自信地站着,面对镜头,肩头有一只充满活力的毛绒黄色鸭子。他留着整齐的短发和浅色胡须,鸭子有橙色的喙和脚,它的翅膀稍微展开,脚分开以保持稳定。他的表情严肃而坚定。他穿着标志性的蓝红超人服装,胸前有黄色“S”标志。斗篷在他身后飘逸。背景有行人。相机位于视线水平,捕捉角色的整个上半身。灯光均匀明亮。"
43
+
44
+ # composition swap_anything
45
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/swap_anything/src_ref_image_1.png" --prompt "视频展示了一个人在宽阔的草原上骑马。他有淡紫色长发,穿着传统服饰白上衣黑裤子,动画建模画风,看起来像是在进行某种户外活动或者是在进行某种表演。背景是壮观的山脉云的天空,给人一种宁静而广阔的感觉。整个视频的拍摄角度是固定的,重点展示了骑手和他的马。"
46
+
47
+ # composition expand_anything
48
+ python vace/vace_wan_inference.py --src_video "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_video.mp4" --src_mask "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_mask.mp4" --src_ref_images "benchmarks/VACE-Benchmark/assets/examples/expand_anything/src_ref_image_1.png" --prompt "古典油画风格,背景是一条河边,画面中央一位成熟优雅的女人,穿着长裙坐在椅子上。她双手从怀里取出打开的红色心形墨镜戴上。固定机位。"
tests/test_annotators.py ADDED
@@ -0,0 +1,568 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+
4
+ import os
5
+ import unittest
6
+ import numpy as np
7
+ from PIL import Image
8
+
9
+ from vace.annotators.utils import read_video_frames
10
+ from vace.annotators.utils import save_one_video
11
+
12
+ class AnnotatorTest(unittest.TestCase):
13
+ def setUp(self):
14
+ print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
15
+ self.save_dir = './cache/test_annotator'
16
+ if not os.path.exists(self.save_dir):
17
+ os.makedirs(self.save_dir)
18
+ # load test image
19
+ self.image_path = './assets/images/test.jpg'
20
+ self.image = Image.open(self.image_path).convert('RGB')
21
+ # load test video
22
+ self.video_path = './assets/videos/test.mp4'
23
+ self.frames = read_video_frames(self.video_path)
24
+
25
+ def tearDown(self):
26
+ super().tearDown()
27
+
28
+ @unittest.skip('')
29
+ def test_annotator_gray_image(self):
30
+ from vace.annotators.gray import GrayAnnotator
31
+ cfg_dict = {}
32
+ anno_ins = GrayAnnotator(cfg_dict)
33
+ anno_image = anno_ins.forward(np.array(self.image))
34
+ save_path = os.path.join(self.save_dir, 'test_gray_image.png')
35
+ Image.fromarray(anno_image).save(save_path)
36
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
37
+
38
+ @unittest.skip('')
39
+ def test_annotator_gray_video(self):
40
+ from vace.annotators.gray import GrayAnnotator
41
+ cfg_dict = {}
42
+ anno_ins = GrayAnnotator(cfg_dict)
43
+ ret_frames = []
44
+ for frame in self.frames:
45
+ anno_frame = anno_ins.forward(np.array(frame))
46
+ ret_frames.append(anno_frame)
47
+ save_path = os.path.join(self.save_dir, 'test_gray_video.mp4')
48
+ save_one_video(save_path, ret_frames, fps=16)
49
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
50
+
51
+ @unittest.skip('')
52
+ def test_annotator_gray_video_2(self):
53
+ from vace.annotators.gray import GrayVideoAnnotator
54
+ cfg_dict = {}
55
+ anno_ins = GrayVideoAnnotator(cfg_dict)
56
+ ret_frames = anno_ins.forward(self.frames)
57
+ save_path = os.path.join(self.save_dir, 'test_gray_video_2.mp4')
58
+ save_one_video(save_path, ret_frames, fps=16)
59
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
60
+
61
+
62
+ @unittest.skip('')
63
+ def test_annotator_pose_image(self):
64
+ from vace.annotators.pose import PoseBodyFaceAnnotator
65
+ cfg_dict = {
66
+ "DETECTION_MODEL": "models/VACE-Annotators/pose/yolox_l.onnx",
67
+ "POSE_MODEL": "models/VACE-Annotators/pose/dw-ll_ucoco_384.onnx",
68
+ "RESIZE_SIZE": 1024
69
+ }
70
+ anno_ins = PoseBodyFaceAnnotator(cfg_dict)
71
+ anno_image = anno_ins.forward(np.array(self.image))
72
+ save_path = os.path.join(self.save_dir, 'test_pose_image.png')
73
+ Image.fromarray(anno_image).save(save_path)
74
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
75
+
76
+ @unittest.skip('')
77
+ def test_annotator_pose_video(self):
78
+ from vace.annotators.pose import PoseBodyFaceAnnotator
79
+ cfg_dict = {
80
+ "DETECTION_MODEL": "models/VACE-Annotators/pose/yolox_l.onnx",
81
+ "POSE_MODEL": "models/VACE-Annotators/pose/dw-ll_ucoco_384.onnx",
82
+ "RESIZE_SIZE": 1024
83
+ }
84
+ anno_ins = PoseBodyFaceAnnotator(cfg_dict)
85
+ ret_frames = []
86
+ for frame in self.frames:
87
+ anno_frame = anno_ins.forward(np.array(frame))
88
+ ret_frames.append(anno_frame)
89
+ save_path = os.path.join(self.save_dir, 'test_pose_video.mp4')
90
+ save_one_video(save_path, ret_frames, fps=16)
91
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
92
+
93
+ @unittest.skip('')
94
+ def test_annotator_pose_video_2(self):
95
+ from vace.annotators.pose import PoseBodyFaceVideoAnnotator
96
+ cfg_dict = {
97
+ "DETECTION_MODEL": "models/VACE-Annotators/pose/yolox_l.onnx",
98
+ "POSE_MODEL": "models/VACE-Annotators/pose/dw-ll_ucoco_384.onnx",
99
+ "RESIZE_SIZE": 1024
100
+ }
101
+ anno_ins = PoseBodyFaceVideoAnnotator(cfg_dict)
102
+ ret_frames = anno_ins.forward(self.frames)
103
+ save_path = os.path.join(self.save_dir, 'test_pose_video_2.mp4')
104
+ save_one_video(save_path, ret_frames, fps=16)
105
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
106
+
107
+ # @unittest.skip('')
108
+ def test_annotator_depth_image(self):
109
+ from vace.annotators.depth import DepthAnnotator
110
+ cfg_dict = {
111
+ "PRETRAINED_MODEL": "models/VACE-Annotators/depth/depth_anything_v2_vitl.pth"
112
+ }
113
+ anno_ins = DepthAnnotator(cfg_dict)
114
+ anno_image = anno_ins.forward(np.array(self.image))
115
+ save_path = os.path.join(self.save_dir, 'test_depth_image.png')
116
+ Image.fromarray(anno_image).save(save_path)
117
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
118
+
119
+ # @unittest.skip('')
120
+ def test_annotator_depth_video(self):
121
+ from vace.annotators.depth import DepthAnnotator
122
+ cfg_dict = {
123
+ "PRETRAINED_MODEL": "models/VACE-Annotators/depth/depth_anything_v2_vitl.pth"
124
+ }
125
+ anno_ins = DepthAnnotator(cfg_dict)
126
+ ret_frames = []
127
+ for frame in self.frames:
128
+ anno_frame = anno_ins.forward(np.array(frame))
129
+ ret_frames.append(anno_frame)
130
+ save_path = os.path.join(self.save_dir, 'test_depth_video.mp4')
131
+ save_one_video(save_path, ret_frames, fps=16)
132
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
133
+
134
+ @unittest.skip('')
135
+ def test_annotator_depth_video_2(self):
136
+ from vace.annotators.depth import DepthVideoAnnotator
137
+ cfg_dict = {
138
+ "PRETRAINED_MODEL": "models/VACE-Annotators/depth/dpt_hybrid-midas-501f0c75.pt"
139
+ }
140
+ anno_ins = DepthVideoAnnotator(cfg_dict)
141
+ ret_frames = anno_ins.forward(self.frames)
142
+ save_path = os.path.join(self.save_dir, 'test_depth_video_2.mp4')
143
+ save_one_video(save_path, ret_frames, fps=16)
144
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
145
+
146
+ @unittest.skip('')
147
+ def test_annotator_scribble_image(self):
148
+ from vace.annotators.scribble import ScribbleAnnotator
149
+ cfg_dict = {
150
+ "PRETRAINED_MODEL": "models/VACE-Annotators/scribble/anime_style/netG_A_latest.pth"
151
+ }
152
+ anno_ins = ScribbleAnnotator(cfg_dict)
153
+ anno_image = anno_ins.forward(np.array(self.image))
154
+ save_path = os.path.join(self.save_dir, 'test_scribble_image.png')
155
+ Image.fromarray(anno_image).save(save_path)
156
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
157
+
158
+ @unittest.skip('')
159
+ def test_annotator_scribble_video(self):
160
+ from vace.annotators.scribble import ScribbleAnnotator
161
+ cfg_dict = {
162
+ "PRETRAINED_MODEL": "models/VACE-Annotators/scribble/anime_style/netG_A_latest.pth"
163
+ }
164
+ anno_ins = ScribbleAnnotator(cfg_dict)
165
+ ret_frames = []
166
+ for frame in self.frames:
167
+ anno_frame = anno_ins.forward(np.array(frame))
168
+ ret_frames.append(anno_frame)
169
+ save_path = os.path.join(self.save_dir, 'test_scribble_video.mp4')
170
+ save_one_video(save_path, ret_frames, fps=16)
171
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
172
+
173
+ @unittest.skip('')
174
+ def test_annotator_scribble_video_2(self):
175
+ from vace.annotators.scribble import ScribbleVideoAnnotator
176
+ cfg_dict = {
177
+ "PRETRAINED_MODEL": "models/VACE-Annotators/scribble/anime_style/netG_A_latest.pth"
178
+ }
179
+ anno_ins = ScribbleVideoAnnotator(cfg_dict)
180
+ ret_frames = anno_ins.forward(self.frames)
181
+ save_path = os.path.join(self.save_dir, 'test_scribble_video_2.mp4')
182
+ save_one_video(save_path, ret_frames, fps=16)
183
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
184
+
185
+ @unittest.skip('')
186
+ def test_annotator_flow_video(self):
187
+ from vace.annotators.flow import FlowVisAnnotator
188
+ cfg_dict = {
189
+ "PRETRAINED_MODEL": "models/VACE-Annotators/flow/raft-things.pth"
190
+ }
191
+ anno_ins = FlowVisAnnotator(cfg_dict)
192
+ ret_frames = anno_ins.forward(self.frames)
193
+ save_path = os.path.join(self.save_dir, 'test_flow_video.mp4')
194
+ save_one_video(save_path, ret_frames, fps=16)
195
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
196
+
197
+ @unittest.skip('')
198
+ def test_annotator_frameref_video_1(self):
199
+ from vace.annotators.frameref import FrameRefExtractAnnotator
200
+ cfg_dict = {
201
+ "REF_CFG": [{"mode": "first", "proba": 0.1},
202
+ {"mode": "last", "proba": 0.1},
203
+ {"mode": "firstlast", "proba": 0.1},
204
+ {"mode": "random", "proba": 0.1}],
205
+ }
206
+ anno_ins = FrameRefExtractAnnotator(cfg_dict)
207
+ ret_frames, ret_masks = anno_ins.forward(self.frames, ref_num=10)
208
+ save_path = os.path.join(self.save_dir, 'test_frameref_video_1.mp4')
209
+ save_one_video(save_path, ret_frames, fps=16)
210
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
211
+ save_path = os.path.join(self.save_dir, 'test_frameref_mask_1.mp4')
212
+ save_one_video(save_path, ret_masks, fps=16)
213
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
214
+
215
+ @unittest.skip('')
216
+ def test_annotator_frameref_video_2(self):
217
+ from vace.annotators.frameref import FrameRefExpandAnnotator
218
+ cfg_dict = {}
219
+ anno_ins = FrameRefExpandAnnotator(cfg_dict)
220
+ ret_frames, ret_masks = anno_ins.forward(frames=self.frames, mode='lastclip', expand_num=50)
221
+ save_path = os.path.join(self.save_dir, 'test_frameref_video_2.mp4')
222
+ save_one_video(save_path, ret_frames, fps=16)
223
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
224
+ save_path = os.path.join(self.save_dir, 'test_frameref_mask_2.mp4')
225
+ save_one_video(save_path, ret_masks, fps=16)
226
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
227
+
228
+
229
+ @unittest.skip('')
230
+ def test_annotator_outpainting_1(self):
231
+ from vace.annotators.outpainting import OutpaintingAnnotator
232
+ cfg_dict = {
233
+ "RETURN_MASK": True,
234
+ "KEEP_PADDING_RATIO": 1,
235
+ "MASK_COLOR": "gray"
236
+ }
237
+ anno_ins = OutpaintingAnnotator(cfg_dict)
238
+ ret_data = anno_ins.forward(self.image, direction=['right', 'up', 'down'], expand_ratio=0.5)
239
+ save_path = os.path.join(self.save_dir, 'test_outpainting_image.png')
240
+ Image.fromarray(ret_data['image']).save(save_path)
241
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
242
+ save_path = os.path.join(self.save_dir, 'test_outpainting_mask.png')
243
+ Image.fromarray(ret_data['mask']).save(save_path)
244
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
245
+
246
+ @unittest.skip('')
247
+ def test_annotator_outpainting_video_1(self):
248
+ from vace.annotators.outpainting import OutpaintingVideoAnnotator
249
+ cfg_dict = {
250
+ "RETURN_MASK": True,
251
+ "KEEP_PADDING_RATIO": 1,
252
+ "MASK_COLOR": "gray"
253
+ }
254
+ anno_ins = OutpaintingVideoAnnotator(cfg_dict)
255
+ ret_data = anno_ins.forward(frames=self.frames, direction=['right', 'up', 'down'], expand_ratio=0.5)
256
+ save_path = os.path.join(self.save_dir, 'test_outpainting_video_1.mp4')
257
+ save_one_video(save_path, ret_data['frames'], fps=16)
258
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
259
+ save_path = os.path.join(self.save_dir, 'test_outpainting_mask_1.mp4')
260
+ save_one_video(save_path, ret_data['masks'], fps=16)
261
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
262
+
263
+ @unittest.skip('')
264
+ def test_annotator_outpainting_inner_1(self):
265
+ from vace.annotators.outpainting import OutpaintingInnerAnnotator
266
+ cfg_dict = {
267
+ "RETURN_MASK": True,
268
+ "KEEP_PADDING_RATIO": 1,
269
+ "MASK_COLOR": "gray"
270
+ }
271
+ anno_ins = OutpaintingInnerAnnotator(cfg_dict)
272
+ ret_data = anno_ins.forward(self.image, direction=['right', 'up', 'down'], expand_ratio=0.15)
273
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_image.png')
274
+ Image.fromarray(ret_data['image']).save(save_path)
275
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
276
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_mask.png')
277
+ Image.fromarray(ret_data['mask']).save(save_path)
278
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
279
+
280
+ @unittest.skip('')
281
+ def test_annotator_outpainting_inner_video_1(self):
282
+ from vace.annotators.outpainting import OutpaintingInnerVideoAnnotator
283
+ cfg_dict = {
284
+ "RETURN_MASK": True,
285
+ "KEEP_PADDING_RATIO": 1,
286
+ "MASK_COLOR": "gray"
287
+ }
288
+ anno_ins = OutpaintingInnerVideoAnnotator(cfg_dict)
289
+ ret_data = anno_ins.forward(self.frames, direction=['right', 'up', 'down'], expand_ratio=0.15)
290
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_video_1.mp4')
291
+ save_one_video(save_path, ret_data['frames'], fps=16)
292
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
293
+ save_path = os.path.join(self.save_dir, 'test_outpainting_inner_mask_1.mp4')
294
+ save_one_video(save_path, ret_data['masks'], fps=16)
295
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
296
+
297
+ @unittest.skip('')
298
+ def test_annotator_salient(self):
299
+ from vace.annotators.salient import SalientAnnotator
300
+ cfg_dict = {
301
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
302
+ }
303
+ anno_ins = SalientAnnotator(cfg_dict)
304
+ ret_data = anno_ins.forward(self.image)
305
+ save_path = os.path.join(self.save_dir, 'test_salient_image.png')
306
+ Image.fromarray(ret_data).save(save_path)
307
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
308
+
309
+ @unittest.skip('')
310
+ def test_annotator_salient_video(self):
311
+ from vace.annotators.salient import SalientVideoAnnotator
312
+ cfg_dict = {
313
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
314
+ }
315
+ anno_ins = SalientVideoAnnotator(cfg_dict)
316
+ ret_frames = anno_ins.forward(self.frames)
317
+ save_path = os.path.join(self.save_dir, 'test_salient_video.mp4')
318
+ save_one_video(save_path, ret_frames, fps=16)
319
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
320
+
321
+ @unittest.skip('')
322
+ def test_annotator_layout_video(self):
323
+ from vace.annotators.layout import LayoutBboxAnnotator
324
+ cfg_dict = {
325
+ "RAM_TAG_COLOR_PATH": "models/VACE-Annotators/layout/ram_tag_color_list.txt",
326
+ }
327
+ anno_ins = LayoutBboxAnnotator(cfg_dict)
328
+ ret_frames = anno_ins.forward(bbox=[(544, 288, 744, 680), (1112, 240, 1280, 712)], frame_size=(720, 1280), num_frames=49, label='person')
329
+ save_path = os.path.join(self.save_dir, 'test_layout_video.mp4')
330
+ save_one_video(save_path, ret_frames, fps=16)
331
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
332
+
333
+ @unittest.skip('')
334
+ def test_annotator_layout_mask_video(self):
335
+ # salient
336
+ from vace.annotators.salient import SalientVideoAnnotator
337
+ cfg_dict = {
338
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
339
+ }
340
+ anno_ins = SalientVideoAnnotator(cfg_dict)
341
+ salient_frames = anno_ins.forward(self.frames)
342
+
343
+ # mask layout
344
+ from vace.annotators.layout import LayoutMaskAnnotator
345
+ cfg_dict = {
346
+ "RAM_TAG_COLOR_PATH": "models/VACE-Annotators/layout/ram_tag_color_list.txt",
347
+ }
348
+ anno_ins = LayoutMaskAnnotator(cfg_dict)
349
+ ret_frames = anno_ins.forward(salient_frames, label='cat')
350
+ save_path = os.path.join(self.save_dir, 'test_mask_layout_video.mp4')
351
+ save_one_video(save_path, ret_frames, fps=16)
352
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
353
+
354
+ @unittest.skip('')
355
+ def test_annotator_layout_mask_video_2(self):
356
+ # salient
357
+ from vace.annotators.salient import SalientVideoAnnotator
358
+ cfg_dict = {
359
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
360
+ }
361
+ anno_ins = SalientVideoAnnotator(cfg_dict)
362
+ salient_frames = anno_ins.forward(self.frames)
363
+
364
+ # mask layout
365
+ from vace.annotators.layout import LayoutMaskAnnotator
366
+ cfg_dict = {
367
+ "RAM_TAG_COLOR_PATH": "models/VACE-Annotators/layout/ram_tag_color_list.txt",
368
+ "USE_AUG": True
369
+ }
370
+ anno_ins = LayoutMaskAnnotator(cfg_dict)
371
+ ret_frames = anno_ins.forward(salient_frames, label='cat', mask_cfg={'mode': 'bbox_expand'})
372
+ save_path = os.path.join(self.save_dir, 'test_mask_layout_video_2.mp4')
373
+ save_one_video(save_path, ret_frames, fps=16)
374
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
375
+
376
+
377
+ @unittest.skip('')
378
+ def test_annotator_maskaug_video(self):
379
+ # salient
380
+ from vace.annotators.salient import SalientVideoAnnotator
381
+ cfg_dict = {
382
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
383
+ }
384
+ anno_ins = SalientVideoAnnotator(cfg_dict)
385
+ salient_frames = anno_ins.forward(self.frames)
386
+
387
+ # mask aug
388
+ from vace.annotators.maskaug import MaskAugAnnotator
389
+ cfg_dict = {}
390
+ anno_ins = MaskAugAnnotator(cfg_dict)
391
+ ret_frames = anno_ins.forward(salient_frames, mask_cfg={'mode': 'hull_expand'})
392
+ save_path = os.path.join(self.save_dir, 'test_maskaug_video.mp4')
393
+ save_one_video(save_path, ret_frames, fps=16)
394
+ print(('Testing %s: %s' % (type(self).__name__, save_path)))
395
+
396
+
397
+ @unittest.skip('')
398
+ def test_annotator_ram(self):
399
+ from vace.annotators.ram import RAMAnnotator
400
+ cfg_dict = {
401
+ "TOKENIZER_PATH": "models/VACE-Annotators/ram/bert-base-uncased",
402
+ "PRETRAINED_MODEL": "models/VACE-Annotators/ram/ram_plus_swin_large_14m.pth",
403
+ }
404
+ anno_ins = RAMAnnotator(cfg_dict)
405
+ ret_data = anno_ins.forward(self.image)
406
+ print(ret_data)
407
+
408
+ @unittest.skip('')
409
+ def test_annotator_gdino_v1(self):
410
+ from vace.annotators.gdino import GDINOAnnotator
411
+ cfg_dict = {
412
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
413
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
414
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
415
+ }
416
+ anno_ins = GDINOAnnotator(cfg_dict)
417
+ ret_data = anno_ins.forward(self.image, caption="a cat and a vase")
418
+ print(ret_data)
419
+
420
+ @unittest.skip('')
421
+ def test_annotator_gdino_v2(self):
422
+ from vace.annotators.gdino import GDINOAnnotator
423
+ cfg_dict = {
424
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
425
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
426
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
427
+ }
428
+ anno_ins = GDINOAnnotator(cfg_dict)
429
+ ret_data = anno_ins.forward(self.image, classes=["cat", "vase"])
430
+ print(ret_data)
431
+
432
+ @unittest.skip('')
433
+ def test_annotator_gdino_with_ram(self):
434
+ from vace.annotators.gdino import GDINORAMAnnotator
435
+ cfg_dict = {
436
+ "RAM": {
437
+ "TOKENIZER_PATH": "models/VACE-Annotators/ram/bert-base-uncased",
438
+ "PRETRAINED_MODEL": "models/VACE-Annotators/ram/ram_plus_swin_large_14m.pth",
439
+ },
440
+ "GDINO": {
441
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
442
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
443
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
444
+ }
445
+
446
+ }
447
+ anno_ins = GDINORAMAnnotator(cfg_dict)
448
+ ret_data = anno_ins.forward(self.image)
449
+ print(ret_data)
450
+
451
+ @unittest.skip('')
452
+ def test_annotator_sam2(self):
453
+ from vace.annotators.sam2 import SAM2VideoAnnotator
454
+ from vace.annotators.utils import save_sam2_video
455
+ cfg_dict = {
456
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
457
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
458
+ }
459
+ anno_ins = SAM2VideoAnnotator(cfg_dict)
460
+ ret_data = anno_ins.forward(video=self.video_path, input_box=[0, 0, 640, 480])
461
+ video_segments = ret_data['annotations']
462
+ save_path = os.path.join(self.save_dir, 'test_sam2_video')
463
+ if not os.path.exists(save_path):
464
+ os.makedirs(save_path)
465
+ save_sam2_video(video_path=self.video_path, video_segments=video_segments, output_video_path=save_path)
466
+ print(save_path)
467
+
468
+
469
+ @unittest.skip('')
470
+ def test_annotator_sam2salient(self):
471
+ from vace.annotators.sam2 import SAM2SalientVideoAnnotator
472
+ from vace.annotators.utils import save_sam2_video
473
+ cfg_dict = {
474
+ "SALIENT": {
475
+ "PRETRAINED_MODEL": "models/VACE-Annotators/salient/u2net.pt",
476
+ },
477
+ "SAM2": {
478
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
479
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
480
+ }
481
+
482
+ }
483
+ anno_ins = SAM2SalientVideoAnnotator(cfg_dict)
484
+ ret_data = anno_ins.forward(video=self.video_path)
485
+ video_segments = ret_data['annotations']
486
+ save_path = os.path.join(self.save_dir, 'test_sam2salient_video')
487
+ if not os.path.exists(save_path):
488
+ os.makedirs(save_path)
489
+ save_sam2_video(video_path=self.video_path, video_segments=video_segments, output_video_path=save_path)
490
+ print(save_path)
491
+
492
+
493
+ @unittest.skip('')
494
+ def test_annotator_sam2gdinoram_video(self):
495
+ from vace.annotators.sam2 import SAM2GDINOVideoAnnotator
496
+ from vace.annotators.utils import save_sam2_video
497
+ cfg_dict = {
498
+ "GDINO": {
499
+ "TOKENIZER_PATH": "models/VACE-Annotators/gdino/bert-base-uncased",
500
+ "CONFIG_PATH": "models/VACE-Annotators/gdino/GroundingDINO_SwinT_OGC_mod.py",
501
+ "PRETRAINED_MODEL": "models/VACE-Annotators/gdino/groundingdino_swint_ogc.pth",
502
+ },
503
+ "SAM2": {
504
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
505
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
506
+ }
507
+ }
508
+ anno_ins = SAM2GDINOVideoAnnotator(cfg_dict)
509
+ ret_data = anno_ins.forward(video=self.video_path, classes='cat')
510
+ video_segments = ret_data['annotations']
511
+ save_path = os.path.join(self.save_dir, 'test_sam2gdino_video')
512
+ if not os.path.exists(save_path):
513
+ os.makedirs(save_path)
514
+ save_sam2_video(video_path=self.video_path, video_segments=video_segments, output_video_path=save_path)
515
+ print(save_path)
516
+
517
+ @unittest.skip('')
518
+ def test_annotator_sam2_image(self):
519
+ from vace.annotators.sam2 import SAM2ImageAnnotator
520
+ cfg_dict = {
521
+ "CONFIG_PATH": 'models/VACE-Annotators/sam2/configs/sam2.1/sam2.1_hiera_l.yaml',
522
+ "PRETRAINED_MODEL": 'models/VACE-Annotators/sam2/sam2.1_hiera_large.pt'
523
+ }
524
+ anno_ins = SAM2ImageAnnotator(cfg_dict)
525
+ ret_data = anno_ins.forward(image=self.image, input_box=[0, 0, 640, 480])
526
+ print(ret_data)
527
+
528
+ @unittest.skip('')
529
+ def test_annotator_prompt_extend(self):
530
+ from vace.annotators.prompt_extend import PromptExtendAnnotator
531
+ from vace.configs.prompt_preprocess import WAN_LM_ZH_SYS_PROMPT, WAN_LM_EN_SYS_PROMPT, LTX_LM_EN_SYS_PROMPT
532
+ cfg_dict = {
533
+ "MODEL_NAME": "models/VACE-Annotators/llm/Qwen2.5-3B-Instruct" # "Qwen2.5_3B"
534
+ }
535
+ anno_ins = PromptExtendAnnotator(cfg_dict)
536
+ ret_data = anno_ins.forward('一位男孩', system_prompt=WAN_LM_ZH_SYS_PROMPT)
537
+ print('wan_zh:', ret_data)
538
+ ret_data = anno_ins.forward('a boy', system_prompt=WAN_LM_EN_SYS_PROMPT)
539
+ print('wan_en:', ret_data)
540
+ ret_data = anno_ins.forward('a boy', system_prompt=WAN_LM_ZH_SYS_PROMPT)
541
+ print('wan_zh en:', ret_data)
542
+ ret_data = anno_ins.forward('a boy', system_prompt=LTX_LM_EN_SYS_PROMPT)
543
+ print('ltx_en:', ret_data)
544
+
545
+ from vace.annotators.utils import get_annotator
546
+ anno_ins = get_annotator(config_type='prompt', config_task='ltx_en', return_dict=False)
547
+ ret_data = anno_ins.forward('a boy', seed=2025)
548
+ print('ltx_en:', ret_data)
549
+ ret_data = anno_ins.forward('a boy')
550
+ print('ltx_en:', ret_data)
551
+ ret_data = anno_ins.forward('a boy', seed=2025)
552
+ print('ltx_en:', ret_data)
553
+
554
+ @unittest.skip('')
555
+ def test_annotator_prompt_extend_ds(self):
556
+ from vace.annotators.utils import get_annotator
557
+ # export DASH_API_KEY=''
558
+ anno_ins = get_annotator(config_type='prompt', config_task='wan_zh_ds', return_dict=False)
559
+ ret_data = anno_ins.forward('一位男孩', seed=2025)
560
+ print('wan_zh_ds:', ret_data)
561
+ ret_data = anno_ins.forward('a boy', seed=2025)
562
+ print('wan_zh_ds:', ret_data)
563
+
564
+
565
+ # ln -s your/path/annotator_models annotator_models
566
+ # PYTHONPATH=. python tests/test_annotators.py
567
+ if __name__ == '__main__':
568
+ unittest.main()
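
For a quick manual check outside the unittest harness, the same annotator API can be called directly. A minimal sketch based on the gray-image test above (the asset path comes from setUp; the output directory is an illustrative choice):

import os
import numpy as np
from PIL import Image
from vace.annotators.gray import GrayAnnotator

image = np.array(Image.open('./assets/images/test.jpg').convert('RGB'))
anno_ins = GrayAnnotator({})            # this annotator takes an empty config
anno_image = anno_ins.forward(image)    # uint8 array, saved the same way as in the tests

os.makedirs('./cache', exist_ok=True)
Image.fromarray(anno_image).save('./cache/gray_image.png')
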
vace/__init__.py ADDED
@@ -0,0 +1,6 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from . import annotators
4
+ from . import configs
5
+ from . import models
6
+ from . import gradios
vace/annotators/__init__.py ADDED
@@ -0,0 +1,24 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ from .depth import DepthAnnotator, DepthVideoAnnotator, DepthV2VideoAnnotator
4
+ from .flow import FlowAnnotator, FlowVisAnnotator
5
+ from .frameref import FrameRefExtractAnnotator, FrameRefExpandAnnotator
6
+ from .gdino import GDINOAnnotator, GDINORAMAnnotator
7
+ from .gray import GrayAnnotator, GrayVideoAnnotator
8
+ from .inpainting import InpaintingAnnotator, InpaintingVideoAnnotator
9
+ from .layout import LayoutBboxAnnotator, LayoutMaskAnnotator, LayoutTrackAnnotator
10
+ from .maskaug import MaskAugAnnotator
11
+ from .outpainting import OutpaintingAnnotator, OutpaintingInnerAnnotator, OutpaintingVideoAnnotator, OutpaintingInnerVideoAnnotator
12
+ from .pose import PoseBodyFaceAnnotator, PoseBodyFaceVideoAnnotator, PoseAnnotator, PoseBodyVideoAnnotator, PoseBodyAnnotator
13
+ from .ram import RAMAnnotator
14
+ from .salient import SalientAnnotator, SalientVideoAnnotator
15
+ from .sam import SAMImageAnnotator
16
+ from .sam2 import SAM2ImageAnnotator, SAM2VideoAnnotator, SAM2SalientVideoAnnotator, SAM2GDINOVideoAnnotator
17
+ from .scribble import ScribbleAnnotator, ScribbleVideoAnnotator
18
+ from .face import FaceAnnotator
19
+ from .subject import SubjectAnnotator
20
+ from .common import PlainImageAnnotator, PlainMaskAnnotator, PlainMaskAugAnnotator, PlainMaskVideoAnnotator, PlainVideoAnnotator, PlainMaskAugVideoAnnotator, PlainMaskAugInvertAnnotator, PlainMaskAugInvertVideoAnnotator, ExpandMaskVideoAnnotator
21
+ from .prompt_extend import PromptExtendAnnotator
22
+ from .composition import CompositionAnnotator, ReferenceAnythingAnnotator, AnimateAnythingAnnotator, SwapAnythingAnnotator, ExpandAnythingAnnotator, MoveAnythingAnnotator
23
+ from .mask import MaskDrawAnnotator
24
+ from .canvas import RegionCanvasAnnotator
vace/annotators/canvas.py ADDED
@@ -0,0 +1,60 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import random
4
+
5
+ import cv2
6
+ import numpy as np
7
+
8
+ from .utils import convert_to_numpy
9
+
10
+
11
+ class RegionCanvasAnnotator:
12
+ def __init__(self, cfg, device=None):
13
+ self.scale_range = cfg.get('SCALE_RANGE', [0.75, 1.0])
14
+ self.canvas_value = cfg.get('CANVAS_VALUE', 255)
15
+ self.use_resize = cfg.get('USE_RESIZE', True)
16
+ self.use_canvas = cfg.get('USE_CANVAS', True)
17
+ self.use_aug = cfg.get('USE_AUG', False)
18
+ if self.use_aug:
19
+ from .maskaug import MaskAugAnnotator
20
+ self.maskaug_anno = MaskAugAnnotator(cfg={})
21
+
22
+ def forward(self, image, mask, mask_cfg=None):
23
+
24
+ image = convert_to_numpy(image)
25
+ mask = convert_to_numpy(mask)
26
+ image_h, image_w = image.shape[:2]
27
+
28
+ if self.use_aug:
29
+ mask = self.maskaug_anno.forward(mask, mask_cfg)
30
+
31
+ # get region with white bg
32
+ image[np.array(mask) == 0] = self.canvas_value
33
+ x, y, w, h = cv2.boundingRect(mask)
34
+ region_crop = image[y:y + h, x:x + w]
35
+
36
+ if self.use_resize:
37
+ # resize region
38
+ scale_min, scale_max = self.scale_range
39
+ scale_factor = random.uniform(scale_min, scale_max)
40
+ new_w, new_h = int(image_w * scale_factor), int(image_h * scale_factor)
41
+ obj_scale_factor = min(new_w/w, new_h/h)
42
+
43
+ new_w = int(w * obj_scale_factor)
44
+ new_h = int(h * obj_scale_factor)
45
+ region_crop_resized = cv2.resize(region_crop, (new_w, new_h), interpolation=cv2.INTER_AREA)
46
+ else:
47
+ region_crop_resized = region_crop
48
+
49
+ if self.use_canvas:
50
+ # plot region into canvas
51
+ new_canvas = np.ones_like(image) * self.canvas_value
52
+ max_x = max(0, image_w - new_w)
53
+ max_y = max(0, image_h - new_h)
54
+ new_x = random.randint(0, max_x)
55
+ new_y = random.randint(0, max_y)
56
+
57
+ new_canvas[new_y:new_y + new_h, new_x:new_x + new_w] = region_crop_resized
58
+ else:
59
+ new_canvas = region_crop_resized
60
+ return new_canvas
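A minimal usage sketch for RegionCanvasAnnotator, assuming an HxWx3 uint8 image and a uint8 mask whose nonzero pixels mark the region to keep (the inputs below are synthetic placeholders):

    import numpy as np
    from vace.annotators.canvas import RegionCanvasAnnotator

    image = np.full((480, 640, 3), 128, dtype=np.uint8)
    mask = np.zeros((480, 640), dtype=np.uint8)
    mask[100:300, 200:400] = 255  # rectangular region of interest

    anno = RegionCanvasAnnotator(cfg={'SCALE_RANGE': [0.75, 1.0], 'CANVAS_VALUE': 255})
    canvas = anno.forward(image, mask)  # region rescaled and pasted at a random spot on a white canvas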
vace/annotators/common.py ADDED
@@ -0,0 +1,62 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+
4
+ class PlainImageAnnotator:
5
+ def __init__(self, cfg):
6
+ pass
7
+ def forward(self, image):
8
+ return image
9
+
10
+ class PlainVideoAnnotator:
11
+ def __init__(self, cfg):
12
+ pass
13
+ def forward(self, frames):
14
+ return frames
15
+
16
+ class PlainMaskAnnotator:
17
+ def __init__(self, cfg):
18
+ pass
19
+ def forward(self, mask):
20
+ return mask
21
+
22
+ class PlainMaskAugInvertAnnotator:
23
+ def __init__(self, cfg):
24
+ pass
25
+ def forward(self, mask):
26
+ return 255 - mask
27
+
28
+ class PlainMaskAugAnnotator:
29
+ def __init__(self, cfg):
30
+ pass
31
+ def forward(self, mask):
32
+ return mask
33
+
34
+ class PlainMaskVideoAnnotator:
35
+ def __init__(self, cfg):
36
+ pass
37
+ def forward(self, mask):
38
+ return mask
39
+
40
+ class PlainMaskAugVideoAnnotator:
41
+ def __init__(self, cfg):
42
+ pass
43
+ def forward(self, masks):
44
+ return masks
45
+
46
+ class PlainMaskAugInvertVideoAnnotator:
47
+ def __init__(self, cfg):
48
+ pass
49
+ def forward(self, masks):
50
+ return [255 - mask for mask in masks]
51
+
52
+ class ExpandMaskVideoAnnotator:
53
+ def __init__(self, cfg):
54
+ pass
55
+ def forward(self, mask, expand_num):
56
+ return [mask] * expand_num
57
+
58
+ class PlainPromptAnnotator:
59
+ def __init__(self, cfg):
60
+ pass
61
+ def forward(self, prompt):
62
+ return prompt
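These plain annotators are deliberate identity/no-op hooks for pipeline slots that need no real processing; the only ones that transform anything invert a mask. A trivial illustration with a uint8 mask:

    import numpy as np
    from vace.annotators.common import PlainMaskAugInvertAnnotator

    mask = np.zeros((8, 8), dtype=np.uint8)
    inverted = PlainMaskAugInvertAnnotator(cfg={}).forward(mask)  # 255 - mask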
vace/annotators/composition.py ADDED
@@ -0,0 +1,155 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import numpy as np
4
+
5
+ class CompositionAnnotator:
6
+ def __init__(self, cfg):
7
+ self.process_types = ["repaint", "extension", "control"]
8
+ self.process_map = {
9
+ "repaint": "repaint",
10
+ "extension": "extension",
11
+ "control": "control",
12
+ "inpainting": "repaint",
13
+ "outpainting": "repaint",
14
+ "frameref": "extension",
15
+ "clipref": "extension",
16
+ "depth": "control",
17
+ "flow": "control",
18
+ "gray": "control",
19
+ "pose": "control",
20
+ "scribble": "control",
21
+ "layout": "control"
22
+ }
23
+
24
+ def forward(self, process_type_1, process_type_2, frames_1, frames_2, masks_1, masks_2):
25
+ total_frames = min(len(frames_1), len(frames_2), len(masks_1), len(masks_2))
26
+ combine_type = (self.process_map[process_type_1], self.process_map[process_type_2])
27
+ if combine_type in [("extension", "repaint"), ("extension", "control"), ("extension", "extension")]:
28
+ output_video = [frames_2[i] * masks_1[i] + frames_1[i] * (1 - masks_1[i]) for i in range(total_frames)]
29
+ output_mask = [masks_1[i] * masks_2[i] * 255 for i in range(total_frames)]
30
+ elif combine_type in [("repaint", "extension"), ("control", "extension"), ("repaint", "repaint")]:
31
+ output_video = [frames_1[i] * (1 - masks_2[i]) + frames_2[i] * masks_2[i] for i in range(total_frames)]
32
+ output_mask = [(masks_1[i] * (1 - masks_2[i]) + masks_2[i] * masks_2[i]) * 255 for i in range(total_frames)]
33
+ elif combine_type in [("repaint", "control"), ("control", "repaint")]:
34
+ if combine_type in [("control", "repaint")]:
35
+ frames_1, frames_2, masks_1, masks_2 = frames_2, frames_1, masks_2, masks_1
36
+ output_video = [frames_1[i] * (1 - masks_1[i]) + frames_2[i] * masks_1[i] for i in range(total_frames)]
37
+ output_mask = [masks_1[i] * 255 for i in range(total_frames)]
38
+ elif combine_type in [("control", "control")]: # apply masks_2
39
+ output_video = [frames_1[i] * (1 - masks_2[i]) + frames_2[i] * masks_2[i] for i in range(total_frames)]
40
+ output_mask = [(masks_1[i] * (1 - masks_2[i]) + masks_2[i] * masks_2[i]) * 255 for i in range(total_frames)]
41
+ else:
42
+ raise Exception("Unknown combine type")
43
+ return output_video, output_mask
44
+
45
+
46
+ class ReferenceAnythingAnnotator:
47
+ def __init__(self, cfg):
48
+ from .subject import SubjectAnnotator
49
+ self.sbjref_ins = SubjectAnnotator(cfg['SUBJECT'] if 'SUBJECT' in cfg else cfg)
50
+ self.key_map = {
51
+ "image": "images",
52
+ "mask": "masks"
53
+ }
54
+ def forward(self, images, mode=None, return_mask=None, mask_cfg=None):
55
+ ret_data = {}
56
+ for image in images:
57
+ ret_one_data = self.sbjref_ins.forward(image=image, mode=mode, return_mask=return_mask, mask_cfg=mask_cfg)
58
+ if isinstance(ret_one_data, dict):
59
+ for key, val in ret_one_data.items():
60
+ if key in self.key_map:
61
+ new_key = self.key_map[key]
62
+ else:
63
+ continue
64
+ if new_key in ret_data:
65
+ ret_data[new_key].append(val)
66
+ else:
67
+ ret_data[new_key] = [val]
68
+ else:
69
+ if 'images' in ret_data:
70
+ ret_data['images'].append(ret_one_data)
71
+ else:
72
+ ret_data['images'] = [ret_one_data]
73
+ return ret_data
74
+
75
+
76
+ class AnimateAnythingAnnotator:
77
+ def __init__(self, cfg):
78
+ from .pose import PoseBodyFaceVideoAnnotator
79
+ self.pose_ins = PoseBodyFaceVideoAnnotator(cfg['POSE'])
80
+ self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
81
+
82
+ def forward(self, frames=None, images=None, mode=None, return_mask=None, mask_cfg=None):
83
+ ret_data = {}
84
+ ret_pose_data = self.pose_ins.forward(frames=frames)
85
+ ret_data.update({"frames": ret_pose_data})
86
+
87
+ ret_ref_data = self.ref_ins.forward(images=images, mode=mode, return_mask=return_mask, mask_cfg=mask_cfg)
88
+ ret_data.update({"images": ret_ref_data['images']})
89
+
90
+ return ret_data
91
+
92
+
93
+ class SwapAnythingAnnotator:
94
+ def __init__(self, cfg):
95
+ from .inpainting import InpaintingVideoAnnotator
96
+ self.inp_ins = InpaintingVideoAnnotator(cfg['INPAINTING'])
97
+ self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
98
+
99
+ def forward(self, video=None, frames=None, images=None, mode=None, mask=None, bbox=None, label=None, caption=None, return_mask=None, mask_cfg=None):
100
+ ret_data = {}
101
+ mode = mode.split(',') if ',' in mode else [mode, mode]
102
+
103
+ ret_inp_data = self.inp_ins.forward(video=video, frames=frames, mode=mode[0], mask=mask, bbox=bbox, label=label, caption=caption, mask_cfg=mask_cfg)
104
+ ret_data.update(ret_inp_data)
105
+
106
+ ret_ref_data = self.ref_ins.forward(images=images, mode=mode[1], return_mask=return_mask, mask_cfg=mask_cfg)
107
+ ret_data.update({"images": ret_ref_data['images']})
108
+
109
+ return ret_data
110
+
111
+
112
+ class ExpandAnythingAnnotator:
113
+ def __init__(self, cfg):
114
+ from .outpainting import OutpaintingAnnotator
115
+ from .frameref import FrameRefExpandAnnotator
116
+ self.ref_ins = ReferenceAnythingAnnotator(cfg['REFERENCE'])
117
+ self.frameref_ins = FrameRefExpandAnnotator(cfg['FRAMEREF'])
118
+ self.outpainting_ins = OutpaintingAnnotator(cfg['OUTPAINTING'])
119
+
120
+ def forward(self, images=None, mode=None, return_mask=None, mask_cfg=None, direction=None, expand_ratio=None, expand_num=None):
121
+ ret_data = {}
122
+ expand_image, reference_image = images[0], images[1:]
123
+ mode = mode.split(',') if ',' in mode else ['firstframe', mode]
124
+
125
+ outpainting_data = self.outpainting_ins.forward(expand_image, expand_ratio=expand_ratio, direction=direction)
126
+ outpainting_image, outpainting_mask = outpainting_data['image'], outpainting_data['mask']
127
+
128
+ frameref_data = self.frameref_ins.forward(outpainting_image, mode=mode[0], expand_num=expand_num)
129
+ frames, masks = frameref_data['frames'], frameref_data['masks']
130
+ masks[0] = outpainting_mask
131
+ ret_data.update({"frames": frames, "masks": masks})
132
+
133
+ ret_ref_data = self.ref_ins.forward(images=reference_image, mode=mode[1], return_mask=return_mask, mask_cfg=mask_cfg)
134
+ ret_data.update({"images": ret_ref_data['images']})
135
+
136
+ return ret_data
137
+
138
+
139
+ class MoveAnythingAnnotator:
140
+ def __init__(self, cfg):
141
+ from .layout import LayoutBboxAnnotator
142
+ self.layout_bbox_ins = LayoutBboxAnnotator(cfg['LAYOUTBBOX'])
143
+
144
+ def forward(self, image=None, bbox=None, label=None, expand_num=None):
145
+ frame_size = image.shape[:2] # [H, W]
146
+ ret_layout_data = self.layout_bbox_ins.forward(bbox, frame_size=frame_size, num_frames=expand_num, label=label)
147
+
148
+ out_frames = [image] + ret_layout_data
149
+ out_mask = [np.zeros(frame_size, dtype=np.uint8)] + [np.ones(frame_size, dtype=np.uint8) * 255] * len(ret_layout_data)
150
+
151
+ ret_data = {
152
+ "frames": out_frames,
153
+ "masks": out_mask
154
+ }
155
+ return ret_data
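A minimal sketch of combining two preprocessed clips with CompositionAnnotator; the frame/mask shapes and the {0, 1} float masks are assumptions consistent with the arithmetic above, not values taken from this repo:

    import numpy as np
    from vace.annotators.composition import CompositionAnnotator

    comp = CompositionAnnotator(cfg={})
    frames_a = [np.zeros((240, 320, 3), dtype=np.float32) for _ in range(4)]
    frames_b = [np.ones((240, 320, 3), dtype=np.float32) for _ in range(4)]
    masks_a = [np.ones((240, 320, 1), dtype=np.float32) for _ in range(4)]
    masks_b = [np.zeros((240, 320, 1), dtype=np.float32) for _ in range(4)]
    video, mask = comp.forward('inpainting', 'depth', frames_a, frames_b, masks_a, masks_b)
    # 'inpainting' maps to "repaint" and 'depth' to "control", so frames_b is pasted where masks_a == 1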
vace/annotators/depth.py ADDED
@@ -0,0 +1,88 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import numpy as np
4
+ import torch
5
+ from einops import rearrange
6
+
7
+ from .utils import convert_to_numpy, resize_image, resize_image_ori
8
+
9
+ class DepthAnnotator:
10
+ def __init__(self, cfg, device=None):
11
+ from .midas.api import MiDaSInference
12
+ pretrained_model = cfg['PRETRAINED_MODEL']
13
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
14
+ self.model = MiDaSInference(model_type='dpt_hybrid', model_path=pretrained_model).to(self.device)
15
+ self.a = cfg.get('A', np.pi * 2.0)
16
+ self.bg_th = cfg.get('BG_TH', 0.1)
17
+
18
+ @torch.no_grad()
19
+ @torch.inference_mode()
20
+ @torch.autocast('cuda', enabled=False)
21
+ def forward(self, image):
22
+ image = convert_to_numpy(image)
23
+ image_depth = image
24
+ h, w, c = image.shape
25
+ image_depth, k = resize_image(image_depth,
26
+ 1024 if min(h, w) > 1024 else min(h, w))
27
+ image_depth = torch.from_numpy(image_depth).float().to(self.device)
28
+ image_depth = image_depth / 127.5 - 1.0
29
+ image_depth = rearrange(image_depth, 'h w c -> 1 c h w')
30
+ depth = self.model(image_depth)[0]
31
+
32
+ depth_pt = depth.clone()
33
+ depth_pt -= torch.min(depth_pt)
34
+ depth_pt /= torch.max(depth_pt)
35
+ depth_pt = depth_pt.cpu().numpy()
36
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
37
+ depth_image = depth_image[..., None].repeat(3, 2)
38
+
39
+ depth_image = resize_image_ori(h, w, depth_image, k)
40
+ return depth_image
41
+
42
+
43
+ class DepthVideoAnnotator(DepthAnnotator):
44
+ def forward(self, frames):
45
+ ret_frames = []
46
+ for frame in frames:
47
+ anno_frame = super().forward(np.array(frame))
48
+ ret_frames.append(anno_frame)
49
+ return ret_frames
50
+
51
+
52
+ class DepthV2Annotator:
53
+ def __init__(self, cfg, device=None):
54
+ from .depth_anything_v2.dpt import DepthAnythingV2
55
+ pretrained_model = cfg['PRETRAINED_MODEL']
56
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") if device is None else device
57
+ self.model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024]).to(self.device)
58
+ self.model.load_state_dict(
59
+ torch.load(
60
+ pretrained_model,
61
+ map_location=self.device
62
+ )
63
+ )
64
+ self.model.eval()
65
+
66
+ @torch.inference_mode()
67
+ @torch.autocast('cuda', enabled=False)
68
+ def forward(self, image):
69
+ image = convert_to_numpy(image)
70
+ depth = self.model.infer_image(image)
71
+
72
+ depth_pt = depth.copy()
73
+ depth_pt -= np.min(depth_pt)
74
+ depth_pt /= np.max(depth_pt)
75
+ depth_image = (depth_pt * 255.0).clip(0, 255).astype(np.uint8)
76
+
77
+ depth_image = depth_image[..., np.newaxis]
78
+ depth_image = np.repeat(depth_image, 3, axis=2)
79
+ return depth_image
80
+
81
+
82
+ class DepthV2VideoAnnotator(DepthV2Annotator):
83
+ def forward(self, frames):
84
+ ret_frames = []
85
+ for frame in frames:
86
+ anno_frame = super().forward(np.array(frame))
87
+ ret_frames.append(anno_frame)
88
+ return ret_frames
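A minimal sketch of driving the Depth-Anything-V2 video annotator above; the checkpoint path and the placeholder frames are illustrative assumptions:

    import numpy as np
    from vace.annotators.depth import DepthV2VideoAnnotator

    frames = [np.zeros((480, 640, 3), dtype=np.uint8) for _ in range(4)]  # placeholder frames
    anno = DepthV2VideoAnnotator(
        cfg={'PRETRAINED_MODEL': 'annotator_models/depth/depth_anything_v2_vitl.pth'})  # placeholder path
    depth_frames = anno.forward(frames)  # one HxWx3 uint8 depth visualization per input frame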
vace/annotators/depth_anything_v2/__init__.py ADDED
File without changes
vace/annotators/depth_anything_v2/dinov2.py ADDED
@@ -0,0 +1,414 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ #
3
+ # This source code is licensed under the Apache License, Version 2.0
4
+ # found in the LICENSE file in the root directory of this source tree.
5
+
6
+ # References:
7
+ # https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
8
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
9
+
10
+ from functools import partial
11
+ import math
12
+ import logging
13
+ from typing import Sequence, Tuple, Union, Callable
14
+
15
+ import torch
16
+ import torch.nn as nn
17
+ import torch.utils.checkpoint
18
+ from torch.nn.init import trunc_normal_
19
+
20
+ from .layers import Mlp, PatchEmbed, SwiGLUFFNFused, MemEffAttention, NestedTensorBlock as Block
21
+
22
+ logger = logging.getLogger("dinov2")
23
+
24
+
25
+ def named_apply(fn: Callable, module: nn.Module, name="", depth_first=True, include_root=False) -> nn.Module:
26
+ if not depth_first and include_root:
27
+ fn(module=module, name=name)
28
+ for child_name, child_module in module.named_children():
29
+ child_name = ".".join((name, child_name)) if name else child_name
30
+ named_apply(fn=fn, module=child_module, name=child_name, depth_first=depth_first, include_root=True)
31
+ if depth_first and include_root:
32
+ fn(module=module, name=name)
33
+ return module
34
+
35
+
36
+ class BlockChunk(nn.ModuleList):
37
+ def forward(self, x):
38
+ for b in self:
39
+ x = b(x)
40
+ return x
41
+
42
+
43
+ class DinoVisionTransformer(nn.Module):
44
+ def __init__(
45
+ self,
46
+ img_size=224,
47
+ patch_size=16,
48
+ in_chans=3,
49
+ embed_dim=768,
50
+ depth=12,
51
+ num_heads=12,
52
+ mlp_ratio=4.0,
53
+ qkv_bias=True,
54
+ ffn_bias=True,
55
+ proj_bias=True,
56
+ drop_path_rate=0.0,
57
+ drop_path_uniform=False,
58
+ init_values=None, # for layerscale: None or 0 => no layerscale
59
+ embed_layer=PatchEmbed,
60
+ act_layer=nn.GELU,
61
+ block_fn=Block,
62
+ ffn_layer="mlp",
63
+ block_chunks=1,
64
+ num_register_tokens=0,
65
+ interpolate_antialias=False,
66
+ interpolate_offset=0.1,
67
+ ):
68
+ """
69
+ Args:
70
+ img_size (int, tuple): input image size
71
+ patch_size (int, tuple): patch size
72
+ in_chans (int): number of input channels
73
+ embed_dim (int): embedding dimension
74
+ depth (int): depth of transformer
75
+ num_heads (int): number of attention heads
76
+ mlp_ratio (int): ratio of mlp hidden dim to embedding dim
77
+ qkv_bias (bool): enable bias for qkv if True
78
+ proj_bias (bool): enable bias for proj in attn if True
79
+ ffn_bias (bool): enable bias for ffn if True
80
+ drop_path_rate (float): stochastic depth rate
81
+ drop_path_uniform (bool): apply uniform drop rate across blocks
82
+ weight_init (str): weight init scheme
83
+ init_values (float): layer-scale init values
84
+ embed_layer (nn.Module): patch embedding layer
85
+ act_layer (nn.Module): MLP activation layer
86
+ block_fn (nn.Module): transformer block class
87
+ ffn_layer (str): "mlp", "swiglu", "swiglufused" or "identity"
88
+ block_chunks: (int) split block sequence into block_chunks units for FSDP wrap
89
+ num_register_tokens: (int) number of extra cls tokens (so-called "registers")
90
+ interpolate_antialias: (str) flag to apply anti-aliasing when interpolating positional embeddings
91
+ interpolate_offset: (float) work-around offset to apply when interpolating positional embeddings
92
+ """
93
+ super().__init__()
94
+ norm_layer = partial(nn.LayerNorm, eps=1e-6)
95
+
96
+ self.num_features = self.embed_dim = embed_dim # num_features for consistency with other models
97
+ self.num_tokens = 1
98
+ self.n_blocks = depth
99
+ self.num_heads = num_heads
100
+ self.patch_size = patch_size
101
+ self.num_register_tokens = num_register_tokens
102
+ self.interpolate_antialias = interpolate_antialias
103
+ self.interpolate_offset = interpolate_offset
104
+
105
+ self.patch_embed = embed_layer(img_size=img_size, patch_size=patch_size, in_chans=in_chans, embed_dim=embed_dim)
106
+ num_patches = self.patch_embed.num_patches
107
+
108
+ self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
109
+ self.pos_embed = nn.Parameter(torch.zeros(1, num_patches + self.num_tokens, embed_dim))
110
+ assert num_register_tokens >= 0
111
+ self.register_tokens = (
112
+ nn.Parameter(torch.zeros(1, num_register_tokens, embed_dim)) if num_register_tokens else None
113
+ )
114
+
115
+ if drop_path_uniform is True:
116
+ dpr = [drop_path_rate] * depth
117
+ else:
118
+ dpr = [x.item() for x in torch.linspace(0, drop_path_rate, depth)] # stochastic depth decay rule
119
+
120
+ if ffn_layer == "mlp":
121
+ logger.info("using MLP layer as FFN")
122
+ ffn_layer = Mlp
123
+ elif ffn_layer == "swiglufused" or ffn_layer == "swiglu":
124
+ logger.info("using SwiGLU layer as FFN")
125
+ ffn_layer = SwiGLUFFNFused
126
+ elif ffn_layer == "identity":
127
+ logger.info("using Identity layer as FFN")
128
+
129
+ def f(*args, **kwargs):
130
+ return nn.Identity()
131
+
132
+ ffn_layer = f
133
+ else:
134
+ raise NotImplementedError
135
+
136
+ blocks_list = [
137
+ block_fn(
138
+ dim=embed_dim,
139
+ num_heads=num_heads,
140
+ mlp_ratio=mlp_ratio,
141
+ qkv_bias=qkv_bias,
142
+ proj_bias=proj_bias,
143
+ ffn_bias=ffn_bias,
144
+ drop_path=dpr[i],
145
+ norm_layer=norm_layer,
146
+ act_layer=act_layer,
147
+ ffn_layer=ffn_layer,
148
+ init_values=init_values,
149
+ )
150
+ for i in range(depth)
151
+ ]
152
+ if block_chunks > 0:
153
+ self.chunked_blocks = True
154
+ chunked_blocks = []
155
+ chunksize = depth // block_chunks
156
+ for i in range(0, depth, chunksize):
157
+ # this is to keep the block index consistent if we chunk the block list
158
+ chunked_blocks.append([nn.Identity()] * i + blocks_list[i: i + chunksize])
159
+ self.blocks = nn.ModuleList([BlockChunk(p) for p in chunked_blocks])
160
+ else:
161
+ self.chunked_blocks = False
162
+ self.blocks = nn.ModuleList(blocks_list)
163
+
164
+ self.norm = norm_layer(embed_dim)
165
+ self.head = nn.Identity()
166
+
167
+ self.mask_token = nn.Parameter(torch.zeros(1, embed_dim))
168
+
169
+ self.init_weights()
170
+
171
+ def init_weights(self):
172
+ trunc_normal_(self.pos_embed, std=0.02)
173
+ nn.init.normal_(self.cls_token, std=1e-6)
174
+ if self.register_tokens is not None:
175
+ nn.init.normal_(self.register_tokens, std=1e-6)
176
+ named_apply(init_weights_vit_timm, self)
177
+
178
+ def interpolate_pos_encoding(self, x, w, h):
179
+ previous_dtype = x.dtype
180
+ npatch = x.shape[1] - 1
181
+ N = self.pos_embed.shape[1] - 1
182
+ if npatch == N and w == h:
183
+ return self.pos_embed
184
+ pos_embed = self.pos_embed.float()
185
+ class_pos_embed = pos_embed[:, 0]
186
+ patch_pos_embed = pos_embed[:, 1:]
187
+ dim = x.shape[-1]
188
+ w0 = w // self.patch_size
189
+ h0 = h // self.patch_size
190
+ # we add a small number to avoid floating point error in the interpolation
191
+ # see discussion at https://github.com/facebookresearch/dino/issues/8
192
+ # DINOv2 with register modify the interpolate_offset from 0.1 to 0.0
193
+ w0, h0 = w0 + self.interpolate_offset, h0 + self.interpolate_offset
194
+ # w0, h0 = w0 + 0.1, h0 + 0.1
195
+
196
+ sqrt_N = math.sqrt(N)
197
+ sx, sy = float(w0) / sqrt_N, float(h0) / sqrt_N
198
+ patch_pos_embed = nn.functional.interpolate(
199
+ patch_pos_embed.reshape(1, int(sqrt_N), int(sqrt_N), dim).permute(0, 3, 1, 2),
200
+ scale_factor=(sx, sy),
201
+ # (int(w0), int(h0)), # to solve the upsampling shape issue
202
+ mode="bicubic",
203
+ antialias=self.interpolate_antialias
204
+ )
205
+
206
+ assert int(w0) == patch_pos_embed.shape[-2]
207
+ assert int(h0) == patch_pos_embed.shape[-1]
208
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
209
+ return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1).to(previous_dtype)
210
+
211
+ def prepare_tokens_with_masks(self, x, masks=None):
212
+ B, nc, w, h = x.shape
213
+ x = self.patch_embed(x)
214
+ if masks is not None:
215
+ x = torch.where(masks.unsqueeze(-1), self.mask_token.to(x.dtype).unsqueeze(0), x)
216
+
217
+ x = torch.cat((self.cls_token.expand(x.shape[0], -1, -1), x), dim=1)
218
+ x = x + self.interpolate_pos_encoding(x, w, h)
219
+
220
+ if self.register_tokens is not None:
221
+ x = torch.cat(
222
+ (
223
+ x[:, :1],
224
+ self.register_tokens.expand(x.shape[0], -1, -1),
225
+ x[:, 1:],
226
+ ),
227
+ dim=1,
228
+ )
229
+
230
+ return x
231
+
232
+ def forward_features_list(self, x_list, masks_list):
233
+ x = [self.prepare_tokens_with_masks(x, masks) for x, masks in zip(x_list, masks_list)]
234
+ for blk in self.blocks:
235
+ x = blk(x)
236
+
237
+ all_x = x
238
+ output = []
239
+ for x, masks in zip(all_x, masks_list):
240
+ x_norm = self.norm(x)
241
+ output.append(
242
+ {
243
+ "x_norm_clstoken": x_norm[:, 0],
244
+ "x_norm_regtokens": x_norm[:, 1: self.num_register_tokens + 1],
245
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1:],
246
+ "x_prenorm": x,
247
+ "masks": masks,
248
+ }
249
+ )
250
+ return output
251
+
252
+ def forward_features(self, x, masks=None):
253
+ if isinstance(x, list):
254
+ return self.forward_features_list(x, masks)
255
+
256
+ x = self.prepare_tokens_with_masks(x, masks)
257
+
258
+ for blk in self.blocks:
259
+ x = blk(x)
260
+
261
+ x_norm = self.norm(x)
262
+ return {
263
+ "x_norm_clstoken": x_norm[:, 0],
264
+ "x_norm_regtokens": x_norm[:, 1: self.num_register_tokens + 1],
265
+ "x_norm_patchtokens": x_norm[:, self.num_register_tokens + 1:],
266
+ "x_prenorm": x,
267
+ "masks": masks,
268
+ }
269
+
270
+ def _get_intermediate_layers_not_chunked(self, x, n=1):
271
+ x = self.prepare_tokens_with_masks(x)
272
+ # If n is an int, take the n last blocks. If it's a list, take them
273
+ output, total_block_len = [], len(self.blocks)
274
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
275
+ for i, blk in enumerate(self.blocks):
276
+ x = blk(x)
277
+ if i in blocks_to_take:
278
+ output.append(x)
279
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
280
+ return output
281
+
282
+ def _get_intermediate_layers_chunked(self, x, n=1):
283
+ x = self.prepare_tokens_with_masks(x)
284
+ output, i, total_block_len = [], 0, len(self.blocks[-1])
285
+ # If n is an int, take the n last blocks. If it's a list, take them
286
+ blocks_to_take = range(total_block_len - n, total_block_len) if isinstance(n, int) else n
287
+ for block_chunk in self.blocks:
288
+ for blk in block_chunk[i:]: # Passing the nn.Identity()
289
+ x = blk(x)
290
+ if i in blocks_to_take:
291
+ output.append(x)
292
+ i += 1
293
+ assert len(output) == len(blocks_to_take), f"only {len(output)} / {len(blocks_to_take)} blocks found"
294
+ return output
295
+
296
+ def get_intermediate_layers(
297
+ self,
298
+ x: torch.Tensor,
299
+ n: Union[int, Sequence] = 1, # Layers or n last layers to take
300
+ reshape: bool = False,
301
+ return_class_token: bool = False,
302
+ norm=True
303
+ ) -> Tuple[Union[torch.Tensor, Tuple[torch.Tensor]]]:
304
+ if self.chunked_blocks:
305
+ outputs = self._get_intermediate_layers_chunked(x, n)
306
+ else:
307
+ outputs = self._get_intermediate_layers_not_chunked(x, n)
308
+ if norm:
309
+ outputs = [self.norm(out) for out in outputs]
310
+ class_tokens = [out[:, 0] for out in outputs]
311
+ outputs = [out[:, 1 + self.num_register_tokens:] for out in outputs]
312
+ if reshape:
313
+ B, _, w, h = x.shape
314
+ outputs = [
315
+ out.reshape(B, w // self.patch_size, h // self.patch_size, -1).permute(0, 3, 1, 2).contiguous()
316
+ for out in outputs
317
+ ]
318
+ if return_class_token:
319
+ return tuple(zip(outputs, class_tokens))
320
+ return tuple(outputs)
321
+
322
+ def forward(self, *args, is_training=False, **kwargs):
323
+ ret = self.forward_features(*args, **kwargs)
324
+ if is_training:
325
+ return ret
326
+ else:
327
+ return self.head(ret["x_norm_clstoken"])
328
+
329
+
330
+ def init_weights_vit_timm(module: nn.Module, name: str = ""):
331
+ """ViT weight initialization, original timm impl (for reproducibility)"""
332
+ if isinstance(module, nn.Linear):
333
+ trunc_normal_(module.weight, std=0.02)
334
+ if module.bias is not None:
335
+ nn.init.zeros_(module.bias)
336
+
337
+
338
+ def vit_small(patch_size=16, num_register_tokens=0, **kwargs):
339
+ model = DinoVisionTransformer(
340
+ patch_size=patch_size,
341
+ embed_dim=384,
342
+ depth=12,
343
+ num_heads=6,
344
+ mlp_ratio=4,
345
+ block_fn=partial(Block, attn_class=MemEffAttention),
346
+ num_register_tokens=num_register_tokens,
347
+ **kwargs,
348
+ )
349
+ return model
350
+
351
+
352
+ def vit_base(patch_size=16, num_register_tokens=0, **kwargs):
353
+ model = DinoVisionTransformer(
354
+ patch_size=patch_size,
355
+ embed_dim=768,
356
+ depth=12,
357
+ num_heads=12,
358
+ mlp_ratio=4,
359
+ block_fn=partial(Block, attn_class=MemEffAttention),
360
+ num_register_tokens=num_register_tokens,
361
+ **kwargs,
362
+ )
363
+ return model
364
+
365
+
366
+ def vit_large(patch_size=16, num_register_tokens=0, **kwargs):
367
+ model = DinoVisionTransformer(
368
+ patch_size=patch_size,
369
+ embed_dim=1024,
370
+ depth=24,
371
+ num_heads=16,
372
+ mlp_ratio=4,
373
+ block_fn=partial(Block, attn_class=MemEffAttention),
374
+ num_register_tokens=num_register_tokens,
375
+ **kwargs,
376
+ )
377
+ return model
378
+
379
+
380
+ def vit_giant2(patch_size=16, num_register_tokens=0, **kwargs):
381
+ """
382
+ Close to ViT-giant, with embed-dim 1536 and 24 heads => embed-dim per head 64
383
+ """
384
+ model = DinoVisionTransformer(
385
+ patch_size=patch_size,
386
+ embed_dim=1536,
387
+ depth=40,
388
+ num_heads=24,
389
+ mlp_ratio=4,
390
+ block_fn=partial(Block, attn_class=MemEffAttention),
391
+ num_register_tokens=num_register_tokens,
392
+ **kwargs,
393
+ )
394
+ return model
395
+
396
+
397
+ def DINOv2(model_name):
398
+ model_zoo = {
399
+ "vits": vit_small,
400
+ "vitb": vit_base,
401
+ "vitl": vit_large,
402
+ "vitg": vit_giant2
403
+ }
404
+
405
+ return model_zoo[model_name](
406
+ img_size=518,
407
+ patch_size=14,
408
+ init_values=1.0,
409
+ ffn_layer="mlp" if model_name != "vitg" else "swiglufused",
410
+ block_chunks=0,
411
+ num_register_tokens=0,
412
+ interpolate_antialias=False,
413
+ interpolate_offset=0.1
414
+ )
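The DINOv2 factory at the end builds a ViT on a 518x518 / patch-14 grid; a small sketch of pulling the intermediate features that the DPT head consumes (random weights, CPU, purely illustrative):

    import torch
    from vace.annotators.depth_anything_v2.dinov2 import DINOv2

    backbone = DINOv2('vitl')                       # ViT-L/14, randomly initialized here
    x = torch.randn(1, 3, 518, 518)                 # H and W must be multiples of 14
    feats = backbone.get_intermediate_layers(x, n=[4, 11, 17, 23], return_class_token=True)
    # feats: tuple of (patch_tokens, cls_token) pairs, one per requested block index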
vace/annotators/depth_anything_v2/dpt.py ADDED
@@ -0,0 +1,210 @@
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Alibaba, Inc. and its affiliates.
3
+ import cv2
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from torchvision.transforms import Compose
8
+
9
+ from .dinov2 import DINOv2
10
+ from .util.blocks import FeatureFusionBlock, _make_scratch
11
+ from .util.transform import Resize, NormalizeImage, PrepareForNet
12
+
13
+
14
+ class DepthAnythingV2(nn.Module):
15
+ def __init__(
16
+ self,
17
+ encoder='vitl',
18
+ features=256,
19
+ out_channels=[256, 512, 1024, 1024],
20
+ use_bn=False,
21
+ use_clstoken=False
22
+ ):
23
+ super(DepthAnythingV2, self).__init__()
24
+
25
+ self.intermediate_layer_idx = {
26
+ 'vits': [2, 5, 8, 11],
27
+ 'vitb': [2, 5, 8, 11],
28
+ 'vitl': [4, 11, 17, 23],
29
+ 'vitg': [9, 19, 29, 39]
30
+ }
31
+
32
+ self.encoder = encoder
33
+ self.pretrained = DINOv2(model_name=encoder)
34
+
35
+ self.depth_head = DPTHead(self.pretrained.embed_dim, features, use_bn, out_channels=out_channels,
36
+ use_clstoken=use_clstoken)
37
+
38
+ def forward(self, x):
39
+ patch_h, patch_w = x.shape[-2] // 14, x.shape[-1] // 14
40
+
41
+ features = self.pretrained.get_intermediate_layers(x, self.intermediate_layer_idx[self.encoder],
42
+ return_class_token=True)
43
+
44
+ depth = self.depth_head(features, patch_h, patch_w)
45
+ depth = F.relu(depth)
46
+
47
+ return depth.squeeze(1)
48
+
49
+ @torch.no_grad()
50
+ def infer_image(self, raw_image, input_size=518):
51
+ image, (h, w) = self.image2tensor(raw_image, input_size)
52
+
53
+ depth = self.forward(image)
54
+ depth = F.interpolate(depth[:, None], (h, w), mode="bilinear", align_corners=True)[0, 0]
55
+
56
+ return depth.cpu().numpy()
57
+
58
+ def image2tensor(self, raw_image, input_size=518):
59
+ transform = Compose([
60
+ Resize(
61
+ width=input_size,
62
+ height=input_size,
63
+ resize_target=False,
64
+ keep_aspect_ratio=True,
65
+ ensure_multiple_of=14,
66
+ resize_method='lower_bound',
67
+ image_interpolation_method=cv2.INTER_CUBIC,
68
+ ),
69
+ NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
70
+ PrepareForNet(),
71
+ ])
72
+
73
+ h, w = raw_image.shape[:2]
74
+
75
+ image = cv2.cvtColor(raw_image, cv2.COLOR_BGR2RGB) / 255.0
76
+
77
+ image = transform({'image': image})['image']
78
+ image = torch.from_numpy(image).unsqueeze(0)
79
+
80
+ DEVICE = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
81
+ image = image.to(DEVICE)
82
+
83
+ return image, (h, w)
84
+
85
+
86
+ class DPTHead(nn.Module):
87
+ def __init__(
88
+ self,
89
+ in_channels,
90
+ features=256,
91
+ use_bn=False,
92
+ out_channels=[256, 512, 1024, 1024],
93
+ use_clstoken=False
94
+ ):
95
+ super(DPTHead, self).__init__()
96
+
97
+ self.use_clstoken = use_clstoken
98
+
99
+ self.projects = nn.ModuleList([
100
+ nn.Conv2d(
101
+ in_channels=in_channels,
102
+ out_channels=out_channel,
103
+ kernel_size=1,
104
+ stride=1,
105
+ padding=0,
106
+ ) for out_channel in out_channels
107
+ ])
108
+
109
+ self.resize_layers = nn.ModuleList([
110
+ nn.ConvTranspose2d(
111
+ in_channels=out_channels[0],
112
+ out_channels=out_channels[0],
113
+ kernel_size=4,
114
+ stride=4,
115
+ padding=0),
116
+ nn.ConvTranspose2d(
117
+ in_channels=out_channels[1],
118
+ out_channels=out_channels[1],
119
+ kernel_size=2,
120
+ stride=2,
121
+ padding=0),
122
+ nn.Identity(),
123
+ nn.Conv2d(
124
+ in_channels=out_channels[3],
125
+ out_channels=out_channels[3],
126
+ kernel_size=3,
127
+ stride=2,
128
+ padding=1)
129
+ ])
130
+
131
+ if use_clstoken:
132
+ self.readout_projects = nn.ModuleList()
133
+ for _ in range(len(self.projects)):
134
+ self.readout_projects.append(
135
+ nn.Sequential(
136
+ nn.Linear(2 * in_channels, in_channels),
137
+ nn.GELU()))
138
+
139
+ self.scratch = _make_scratch(
140
+ out_channels,
141
+ features,
142
+ groups=1,
143
+ expand=False,
144
+ )
145
+
146
+ self.scratch.stem_transpose = None
147
+
148
+ self.scratch.refinenet1 = _make_fusion_block(features, use_bn)
149
+ self.scratch.refinenet2 = _make_fusion_block(features, use_bn)
150
+ self.scratch.refinenet3 = _make_fusion_block(features, use_bn)
151
+ self.scratch.refinenet4 = _make_fusion_block(features, use_bn)
152
+
153
+ head_features_1 = features
154
+ head_features_2 = 32
155
+
156
+ self.scratch.output_conv1 = nn.Conv2d(head_features_1, head_features_1 // 2, kernel_size=3, stride=1, padding=1)
157
+ self.scratch.output_conv2 = nn.Sequential(
158
+ nn.Conv2d(head_features_1 // 2, head_features_2, kernel_size=3, stride=1, padding=1),
159
+ nn.ReLU(True),
160
+ nn.Conv2d(head_features_2, 1, kernel_size=1, stride=1, padding=0),
161
+ nn.ReLU(True),
162
+ nn.Identity(),
163
+ )
164
+
165
+ def forward(self, out_features, patch_h, patch_w):
166
+ out = []
167
+ for i, x in enumerate(out_features):
168
+ if self.use_clstoken:
169
+ x, cls_token = x[0], x[1]
170
+ readout = cls_token.unsqueeze(1).expand_as(x)
171
+ x = self.readout_projects[i](torch.cat((x, readout), -1))
172
+ else:
173
+ x = x[0]
174
+
175
+ x = x.permute(0, 2, 1).reshape((x.shape[0], x.shape[-1], patch_h, patch_w))
176
+
177
+ x = self.projects[i](x)
178
+ x = self.resize_layers[i](x)
179
+
180
+ out.append(x)
181
+
182
+ layer_1, layer_2, layer_3, layer_4 = out
183
+
184
+ layer_1_rn = self.scratch.layer1_rn(layer_1)
185
+ layer_2_rn = self.scratch.layer2_rn(layer_2)
186
+ layer_3_rn = self.scratch.layer3_rn(layer_3)
187
+ layer_4_rn = self.scratch.layer4_rn(layer_4)
188
+
189
+ path_4 = self.scratch.refinenet4(layer_4_rn, size=layer_3_rn.shape[2:])
190
+ path_3 = self.scratch.refinenet3(path_4, layer_3_rn, size=layer_2_rn.shape[2:])
191
+ path_2 = self.scratch.refinenet2(path_3, layer_2_rn, size=layer_1_rn.shape[2:])
192
+ path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
193
+
194
+ out = self.scratch.output_conv1(path_1)
195
+ out = F.interpolate(out, (int(patch_h * 14), int(patch_w * 14)), mode="bilinear", align_corners=True)
196
+ out = self.scratch.output_conv2(out)
197
+
198
+ return out
199
+
200
+
201
+ def _make_fusion_block(features, use_bn, size=None):
202
+ return FeatureFusionBlock(
203
+ features,
204
+ nn.ReLU(False),
205
+ deconv=False,
206
+ bn=use_bn,
207
+ expand=False,
208
+ align_corners=True,
209
+ size=size,
210
+ )
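For completeness, the DPT model above can also be used directly without the annotator wrapper; a sketch under the assumption of a local ViT-L checkpoint (placeholder path), with the model moved to the same device that image2tensor() selects internally:

    import cv2
    import torch
    from vace.annotators.depth_anything_v2.dpt import DepthAnythingV2

    model = DepthAnythingV2(encoder='vitl', features=256, out_channels=[256, 512, 1024, 1024])
    model.load_state_dict(torch.load('annotator_models/depth/depth_anything_v2_vitl.pth',
                                     map_location='cpu'))  # placeholder path
    device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
    model = model.to(device).eval()             # image2tensor() picks the same device order
    raw = cv2.imread('assets/images/test.jpg')  # BGR uint8, as image2tensor() expects
    depth = model.infer_image(raw, input_size=518)  # HxW float32 relative depth map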
vace/annotators/depth_anything_v2/layers/__init__.py ADDED
@@ -0,0 +1,11 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ from .mlp import Mlp
8
+ from .patch_embed import PatchEmbed
9
+ from .swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused
10
+ from .block import NestedTensorBlock
11
+ from .attention import MemEffAttention
vace/annotators/depth_anything_v2/layers/attention.py ADDED
@@ -0,0 +1,79 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/models/vision_transformer.py
10
+
11
+ import logging
12
+
13
+ from torch import Tensor
14
+ from torch import nn
15
+
16
+ logger = logging.getLogger("dinov2")
17
+
18
+ try:
19
+ from xformers.ops import memory_efficient_attention, unbind, fmha
20
+
21
+ XFORMERS_AVAILABLE = True
22
+ except ImportError:
23
+ logger.warning("xFormers not available")
24
+ XFORMERS_AVAILABLE = False
25
+
26
+
27
+ class Attention(nn.Module):
28
+ def __init__(
29
+ self,
30
+ dim: int,
31
+ num_heads: int = 8,
32
+ qkv_bias: bool = False,
33
+ proj_bias: bool = True,
34
+ attn_drop: float = 0.0,
35
+ proj_drop: float = 0.0,
36
+ ) -> None:
37
+ super().__init__()
38
+ self.num_heads = num_heads
39
+ head_dim = dim // num_heads
40
+ self.scale = head_dim ** -0.5
41
+
42
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
43
+ self.attn_drop = nn.Dropout(attn_drop)
44
+ self.proj = nn.Linear(dim, dim, bias=proj_bias)
45
+ self.proj_drop = nn.Dropout(proj_drop)
46
+
47
+ def forward(self, x: Tensor) -> Tensor:
48
+ B, N, C = x.shape
49
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
50
+
51
+ q, k, v = qkv[0] * self.scale, qkv[1], qkv[2]
52
+ attn = q @ k.transpose(-2, -1)
53
+
54
+ attn = attn.softmax(dim=-1)
55
+ attn = self.attn_drop(attn)
56
+
57
+ x = (attn @ v).transpose(1, 2).reshape(B, N, C)
58
+ x = self.proj(x)
59
+ x = self.proj_drop(x)
60
+ return x
61
+
62
+
63
+ class MemEffAttention(Attention):
64
+ def forward(self, x: Tensor, attn_bias=None) -> Tensor:
65
+ if not XFORMERS_AVAILABLE:
66
+ assert attn_bias is None, "xFormers is required for nested tensors usage"
67
+ return super().forward(x)
68
+
69
+ B, N, C = x.shape
70
+ qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, C // self.num_heads)
71
+
72
+ q, k, v = unbind(qkv, 2)
73
+
74
+ x = memory_efficient_attention(q, k, v, attn_bias=attn_bias)
75
+ x = x.reshape([B, N, C])
76
+
77
+ x = self.proj(x)
78
+ x = self.proj_drop(x)
79
+ return x
vace/annotators/depth_anything_v2/layers/block.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
10
+
11
+ import logging
12
+ from typing import Callable, List, Any, Tuple, Dict
13
+
14
+ import torch
15
+ from torch import nn, Tensor
16
+
17
+ from .attention import Attention, MemEffAttention
18
+ from .drop_path import DropPath
19
+ from .layer_scale import LayerScale
20
+ from .mlp import Mlp
21
+
22
+
23
+ logger = logging.getLogger("dinov2")
24
+
25
+
26
+ try:
27
+ from xformers.ops import fmha
28
+ from xformers.ops import scaled_index_add, index_select_cat
29
+
30
+ XFORMERS_AVAILABLE = True
31
+ except ImportError:
32
+ # logger.warning("xFormers not available")
33
+ XFORMERS_AVAILABLE = False
34
+
35
+
36
+ class Block(nn.Module):
37
+ def __init__(
38
+ self,
39
+ dim: int,
40
+ num_heads: int,
41
+ mlp_ratio: float = 4.0,
42
+ qkv_bias: bool = False,
43
+ proj_bias: bool = True,
44
+ ffn_bias: bool = True,
45
+ drop: float = 0.0,
46
+ attn_drop: float = 0.0,
47
+ init_values=None,
48
+ drop_path: float = 0.0,
49
+ act_layer: Callable[..., nn.Module] = nn.GELU,
50
+ norm_layer: Callable[..., nn.Module] = nn.LayerNorm,
51
+ attn_class: Callable[..., nn.Module] = Attention,
52
+ ffn_layer: Callable[..., nn.Module] = Mlp,
53
+ ) -> None:
54
+ super().__init__()
55
+ # print(f"biases: qkv: {qkv_bias}, proj: {proj_bias}, ffn: {ffn_bias}")
56
+ self.norm1 = norm_layer(dim)
57
+ self.attn = attn_class(
58
+ dim,
59
+ num_heads=num_heads,
60
+ qkv_bias=qkv_bias,
61
+ proj_bias=proj_bias,
62
+ attn_drop=attn_drop,
63
+ proj_drop=drop,
64
+ )
65
+ self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
66
+ self.drop_path1 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
67
+
68
+ self.norm2 = norm_layer(dim)
69
+ mlp_hidden_dim = int(dim * mlp_ratio)
70
+ self.mlp = ffn_layer(
71
+ in_features=dim,
72
+ hidden_features=mlp_hidden_dim,
73
+ act_layer=act_layer,
74
+ drop=drop,
75
+ bias=ffn_bias,
76
+ )
77
+ self.ls2 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()
78
+ self.drop_path2 = DropPath(drop_path) if drop_path > 0.0 else nn.Identity()
79
+
80
+ self.sample_drop_ratio = drop_path
81
+
82
+ def forward(self, x: Tensor) -> Tensor:
83
+ def attn_residual_func(x: Tensor) -> Tensor:
84
+ return self.ls1(self.attn(self.norm1(x)))
85
+
86
+ def ffn_residual_func(x: Tensor) -> Tensor:
87
+ return self.ls2(self.mlp(self.norm2(x)))
88
+
89
+ if self.training and self.sample_drop_ratio > 0.1:
90
+ # the overhead is compensated only for a drop path rate larger than 0.1
91
+ x = drop_add_residual_stochastic_depth(
92
+ x,
93
+ residual_func=attn_residual_func,
94
+ sample_drop_ratio=self.sample_drop_ratio,
95
+ )
96
+ x = drop_add_residual_stochastic_depth(
97
+ x,
98
+ residual_func=ffn_residual_func,
99
+ sample_drop_ratio=self.sample_drop_ratio,
100
+ )
101
+ elif self.training and self.sample_drop_ratio > 0.0:
102
+ x = x + self.drop_path1(attn_residual_func(x))
103
+ x = x + self.drop_path2(ffn_residual_func(x))
104
+ else:
105
+ x = x + attn_residual_func(x)
106
+ x = x + ffn_residual_func(x)
107
+ return x
108
+
109
+
110
+ def drop_add_residual_stochastic_depth(
111
+ x: Tensor,
112
+ residual_func: Callable[[Tensor], Tensor],
113
+ sample_drop_ratio: float = 0.0,
114
+ ) -> Tensor:
115
+ # 1) extract subset using permutation
116
+ b, n, d = x.shape
117
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
118
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
119
+ x_subset = x[brange]
120
+
121
+ # 2) apply residual_func to get residual
122
+ residual = residual_func(x_subset)
123
+
124
+ x_flat = x.flatten(1)
125
+ residual = residual.flatten(1)
126
+
127
+ residual_scale_factor = b / sample_subset_size
128
+
129
+ # 3) add the residual
130
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
131
+ return x_plus_residual.view_as(x)
132
+
133
+
134
+ def get_branges_scales(x, sample_drop_ratio=0.0):
135
+ b, n, d = x.shape
136
+ sample_subset_size = max(int(b * (1 - sample_drop_ratio)), 1)
137
+ brange = (torch.randperm(b, device=x.device))[:sample_subset_size]
138
+ residual_scale_factor = b / sample_subset_size
139
+ return brange, residual_scale_factor
140
+
141
+
142
+ def add_residual(x, brange, residual, residual_scale_factor, scaling_vector=None):
143
+ if scaling_vector is None:
144
+ x_flat = x.flatten(1)
145
+ residual = residual.flatten(1)
146
+ x_plus_residual = torch.index_add(x_flat, 0, brange, residual.to(dtype=x.dtype), alpha=residual_scale_factor)
147
+ else:
148
+ x_plus_residual = scaled_index_add(
149
+ x, brange, residual.to(dtype=x.dtype), scaling=scaling_vector, alpha=residual_scale_factor
150
+ )
151
+ return x_plus_residual
152
+
153
+
154
+ attn_bias_cache: Dict[Tuple, Any] = {}
155
+
156
+
157
+ def get_attn_bias_and_cat(x_list, branges=None):
158
+ """
159
+ this will perform the index select, cat the tensors, and provide the attn_bias from cache
160
+ """
161
+ batch_sizes = [b.shape[0] for b in branges] if branges is not None else [x.shape[0] for x in x_list]
162
+ all_shapes = tuple((b, x.shape[1]) for b, x in zip(batch_sizes, x_list))
163
+ if all_shapes not in attn_bias_cache.keys():
164
+ seqlens = []
165
+ for b, x in zip(batch_sizes, x_list):
166
+ for _ in range(b):
167
+ seqlens.append(x.shape[1])
168
+ attn_bias = fmha.BlockDiagonalMask.from_seqlens(seqlens)
169
+ attn_bias._batch_sizes = batch_sizes
170
+ attn_bias_cache[all_shapes] = attn_bias
171
+
172
+ if branges is not None:
173
+ cat_tensors = index_select_cat([x.flatten(1) for x in x_list], branges).view(1, -1, x_list[0].shape[-1])
174
+ else:
175
+ tensors_bs1 = tuple(x.reshape([1, -1, *x.shape[2:]]) for x in x_list)
176
+ cat_tensors = torch.cat(tensors_bs1, dim=1)
177
+
178
+ return attn_bias_cache[all_shapes], cat_tensors
179
+
180
+
181
+ def drop_add_residual_stochastic_depth_list(
182
+ x_list: List[Tensor],
183
+ residual_func: Callable[[Tensor, Any], Tensor],
184
+ sample_drop_ratio: float = 0.0,
185
+ scaling_vector=None,
186
+ ) -> Tensor:
187
+ # 1) generate random set of indices for dropping samples in the batch
188
+ branges_scales = [get_branges_scales(x, sample_drop_ratio=sample_drop_ratio) for x in x_list]
189
+ branges = [s[0] for s in branges_scales]
190
+ residual_scale_factors = [s[1] for s in branges_scales]
191
+
192
+ # 2) get attention bias and index+concat the tensors
193
+ attn_bias, x_cat = get_attn_bias_and_cat(x_list, branges)
194
+
195
+ # 3) apply residual_func to get residual, and split the result
196
+ residual_list = attn_bias.split(residual_func(x_cat, attn_bias=attn_bias)) # type: ignore
197
+
198
+ outputs = []
199
+ for x, brange, residual, residual_scale_factor in zip(x_list, branges, residual_list, residual_scale_factors):
200
+ outputs.append(add_residual(x, brange, residual, residual_scale_factor, scaling_vector).view_as(x))
201
+ return outputs
202
+
203
+
204
+ class NestedTensorBlock(Block):
205
+ def forward_nested(self, x_list: List[Tensor]) -> List[Tensor]:
206
+ """
207
+ x_list contains a list of tensors to nest together and run
208
+ """
209
+ assert isinstance(self.attn, MemEffAttention)
210
+
211
+ if self.training and self.sample_drop_ratio > 0.0:
212
+
213
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
214
+ return self.attn(self.norm1(x), attn_bias=attn_bias)
215
+
216
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
217
+ return self.mlp(self.norm2(x))
218
+
219
+ x_list = drop_add_residual_stochastic_depth_list(
220
+ x_list,
221
+ residual_func=attn_residual_func,
222
+ sample_drop_ratio=self.sample_drop_ratio,
223
+ scaling_vector=self.ls1.gamma if isinstance(self.ls1, LayerScale) else None,
224
+ )
225
+ x_list = drop_add_residual_stochastic_depth_list(
226
+ x_list,
227
+ residual_func=ffn_residual_func,
228
+ sample_drop_ratio=self.sample_drop_ratio,
229
+ scaling_vector=self.ls2.gamma if isinstance(self.ls2, LayerScale) else None,
230
+ )
231
+ return x_list
232
+ else:
233
+
234
+ def attn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
235
+ return self.ls1(self.attn(self.norm1(x), attn_bias=attn_bias))
236
+
237
+ def ffn_residual_func(x: Tensor, attn_bias=None) -> Tensor:
238
+ return self.ls2(self.mlp(self.norm2(x)))
239
+
240
+ attn_bias, x = get_attn_bias_and_cat(x_list)
241
+ x = x + attn_residual_func(x, attn_bias=attn_bias)
242
+ x = x + ffn_residual_func(x)
243
+ return attn_bias.split(x)
244
+
245
+ def forward(self, x_or_x_list):
246
+ if isinstance(x_or_x_list, Tensor):
247
+ return super().forward(x_or_x_list)
248
+ elif isinstance(x_or_x_list, list):
249
+ assert XFORMERS_AVAILABLE, "Please install xFormers for nested tensors usage"
250
+ return self.forward_nested(x_or_x_list)
251
+ else:
252
+ raise AssertionError
vace/annotators/depth_anything_v2/layers/drop_path.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/drop.py
10
+
11
+ from torch import nn
12
+
13
+
14
+ def drop_path(x, drop_prob: float = 0.0, training: bool = False):
15
+ if drop_prob == 0.0 or not training:
16
+ return x
17
+ keep_prob = 1 - drop_prob
18
+ shape = (x.shape[0],) + (1,) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
19
+ random_tensor = x.new_empty(shape).bernoulli_(keep_prob)
20
+ if keep_prob > 0.0:
21
+ random_tensor.div_(keep_prob)
22
+ output = x * random_tensor
23
+ return output
24
+
25
+
26
+ class DropPath(nn.Module):
27
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
28
+
29
+ def __init__(self, drop_prob=None):
30
+ super(DropPath, self).__init__()
31
+ self.drop_prob = drop_prob
32
+
33
+ def forward(self, x):
34
+ return drop_path(x, self.drop_prob, self.training)
vace/annotators/depth_anything_v2/layers/layer_scale.py ADDED
@@ -0,0 +1,28 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+ # Modified from: https://github.com/huggingface/pytorch-image-models/blob/main/timm/models/vision_transformer.py
7
+
8
+
9
+ from typing import Union
10
+
11
+ import torch
12
+ from torch import Tensor
13
+ from torch import nn
14
+
15
+
16
+ class LayerScale(nn.Module):
17
+ def __init__(
18
+ self,
19
+ dim: int,
20
+ init_values: Union[float, Tensor] = 1e-5,
21
+ inplace: bool = False,
22
+ ) -> None:
23
+ super().__init__()
24
+ self.inplace = inplace
25
+ self.gamma = nn.Parameter(init_values * torch.ones(dim))
26
+
27
+ def forward(self, x: Tensor) -> Tensor:
28
+ return x.mul_(self.gamma) if self.inplace else x * self.gamma
vace/annotators/depth_anything_v2/layers/mlp.py ADDED
@@ -0,0 +1,39 @@
1
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
2
+ # All rights reserved.
3
+ #
4
+ # This source code is licensed under the license found in the
5
+ # LICENSE file in the root directory of this source tree.
6
+
7
+ # References:
8
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
9
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/mlp.py
10
+
11
+ from typing import Callable, Optional
12
+ from torch import Tensor, nn
13
+
14
+
15
+ class Mlp(nn.Module):
16
+ def __init__(
17
+ self,
18
+ in_features: int,
19
+ hidden_features: Optional[int] = None,
20
+ out_features: Optional[int] = None,
21
+ act_layer: Callable[..., nn.Module] = nn.GELU,
22
+ drop: float = 0.0,
23
+ bias: bool = True,
24
+ ) -> None:
25
+ super().__init__()
26
+ out_features = out_features or in_features
27
+ hidden_features = hidden_features or in_features
28
+ self.fc1 = nn.Linear(in_features, hidden_features, bias=bias)
29
+ self.act = act_layer()
30
+ self.fc2 = nn.Linear(hidden_features, out_features, bias=bias)
31
+ self.drop = nn.Dropout(drop)
32
+
33
+ def forward(self, x: Tensor) -> Tensor:
34
+ x = self.fc1(x)
35
+ x = self.act(x)
36
+ x = self.drop(x)
37
+ x = self.fc2(x)
38
+ x = self.drop(x)
39
+ return x
vace/annotators/depth_anything_v2/layers/patch_embed.py ADDED
@@ -0,0 +1,90 @@
1
+
2
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
3
+ # All rights reserved.
4
+ #
5
+ # This source code is licensed under the license found in the
6
+ # LICENSE file in the root directory of this source tree.
7
+
8
+ # References:
9
+ # https://github.com/facebookresearch/dino/blob/master/vision_transformer.py
10
+ # https://github.com/rwightman/pytorch-image-models/tree/master/timm/layers/patch_embed.py
11
+
12
+ from typing import Callable, Optional, Tuple, Union
13
+
14
+ from torch import Tensor
15
+ import torch.nn as nn
16
+
17
+
18
+ def make_2tuple(x):
19
+ if isinstance(x, tuple):
20
+ assert len(x) == 2
21
+ return x
22
+
23
+ assert isinstance(x, int)
24
+ return (x, x)
25
+
26
+
27
+ class PatchEmbed(nn.Module):
28
+ """
29
+ 2D image to patch embedding: (B,C,H,W) -> (B,N,D)
30
+
31
+ Args:
32
+ img_size: Image size.
33
+ patch_size: Patch token size.
34
+ in_chans: Number of input image channels.
35
+ embed_dim: Number of linear projection output channels.
36
+ norm_layer: Normalization layer.
37
+ """
38
+
39
+ def __init__(
40
+ self,
41
+ img_size: Union[int, Tuple[int, int]] = 224,
42
+ patch_size: Union[int, Tuple[int, int]] = 16,
43
+ in_chans: int = 3,
44
+ embed_dim: int = 768,
45
+ norm_layer: Optional[Callable] = None,
46
+ flatten_embedding: bool = True,
47
+ ) -> None:
48
+ super().__init__()
49
+
50
+ image_HW = make_2tuple(img_size)
51
+ patch_HW = make_2tuple(patch_size)
52
+ patch_grid_size = (
53
+ image_HW[0] // patch_HW[0],
54
+ image_HW[1] // patch_HW[1],
55
+ )
56
+
57
+ self.img_size = image_HW
58
+ self.patch_size = patch_HW
59
+ self.patches_resolution = patch_grid_size
60
+ self.num_patches = patch_grid_size[0] * patch_grid_size[1]
61
+
62
+ self.in_chans = in_chans
63
+ self.embed_dim = embed_dim
64
+
65
+ self.flatten_embedding = flatten_embedding
66
+
67
+ self.proj = nn.Conv2d(in_chans, embed_dim, kernel_size=patch_HW, stride=patch_HW)
68
+ self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()
69
+
70
+ def forward(self, x: Tensor) -> Tensor:
71
+ _, _, H, W = x.shape
72
+ patch_H, patch_W = self.patch_size
73
+
74
+ assert H % patch_H == 0, f"Input image height {H} is not a multiple of patch height {patch_H}"
75
+ assert W % patch_W == 0, f"Input image width {W} is not a multiple of patch width: {patch_W}"
76
+
77
+ x = self.proj(x) # B C H W
78
+ H, W = x.size(2), x.size(3)
79
+ x = x.flatten(2).transpose(1, 2) # B HW C
80
+ x = self.norm(x)
81
+ if not self.flatten_embedding:
82
+ x = x.reshape(-1, H, W, self.embed_dim) # B H W C
83
+ return x
84
+
85
+ def flops(self) -> float:
86
+ Ho, Wo = self.patches_resolution
87
+ flops = Ho * Wo * self.embed_dim * self.in_chans * (self.patch_size[0] * self.patch_size[1])
88
+ if self.norm is not None:
89
+ flops += Ho * Wo * self.embed_dim
90
+ return flops
vace/annotators/depth_anything_v2/layers/swiglu_ffn.py ADDED
@@ -0,0 +1,64 @@
+
+ # Copyright (c) Meta Platforms, Inc. and affiliates.
+ # All rights reserved.
+ #
+ # This source code is licensed under the license found in the
+ # LICENSE file in the root directory of this source tree.
+
+ from typing import Callable, Optional
+
+ from torch import Tensor, nn
+ import torch.nn.functional as F
+
+
+ class SwiGLUFFN(nn.Module):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = None,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         super().__init__()
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         self.w12 = nn.Linear(in_features, 2 * hidden_features, bias=bias)
+         self.w3 = nn.Linear(hidden_features, out_features, bias=bias)
+
+     def forward(self, x: Tensor) -> Tensor:
+         x12 = self.w12(x)
+         x1, x2 = x12.chunk(2, dim=-1)
+         hidden = F.silu(x1) * x2
+         return self.w3(hidden)
+
+
+ try:
+     from xformers.ops import SwiGLU
+
+     XFORMERS_AVAILABLE = True
+ except ImportError:
+     SwiGLU = SwiGLUFFN
+     XFORMERS_AVAILABLE = False
+
+
+ class SwiGLUFFNFused(SwiGLU):
+     def __init__(
+         self,
+         in_features: int,
+         hidden_features: Optional[int] = None,
+         out_features: Optional[int] = None,
+         act_layer: Callable[..., nn.Module] = None,
+         drop: float = 0.0,
+         bias: bool = True,
+     ) -> None:
+         out_features = out_features or in_features
+         hidden_features = hidden_features or in_features
+         hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+         super().__init__(
+             in_features=in_features,
+             hidden_features=hidden_features,
+             out_features=out_features,
+             bias=bias,
+         )
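
A minimal sketch of the two feed-forward variants above (import path assumed). `SwiGLUFFNFused` shrinks the requested hidden width to roughly two thirds and rounds it up to a multiple of 8, and it transparently falls back to the pure-PyTorch `SwiGLUFFN` when xformers is not installed:

import torch
from vace.annotators.depth_anything_v2.layers.swiglu_ffn import SwiGLUFFN, SwiGLUFFNFused  # assumed path

x = torch.randn(2, 196, 768)
ffn = SwiGLUFFN(in_features=768, hidden_features=3072)
print(ffn(x).shape)     # torch.Size([2, 196, 768])

# Fused variant: int(3072 * 2 / 3) = 2048, already a multiple of 8, so the hidden width becomes 2048.
fused = SwiGLUFFNFused(in_features=768, hidden_features=3072)
print(fused(x).shape)   # torch.Size([2, 196, 768])
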
vace/annotators/depth_anything_v2/util/__init__.py ADDED
File without changes
vace/annotators/depth_anything_v2/util/blocks.py ADDED
@@ -0,0 +1,151 @@
+ import torch.nn as nn
+
+
+ def _make_scratch(in_shape, out_shape, groups=1, expand=False):
+     scratch = nn.Module()
+
+     out_shape1 = out_shape
+     out_shape2 = out_shape
+     out_shape3 = out_shape
+     if len(in_shape) >= 4:
+         out_shape4 = out_shape
+
+     if expand:
+         out_shape1 = out_shape
+         out_shape2 = out_shape * 2
+         out_shape3 = out_shape * 4
+         if len(in_shape) >= 4:
+             out_shape4 = out_shape * 8
+
+     scratch.layer1_rn = nn.Conv2d(in_shape[0], out_shape1, kernel_size=3, stride=1, padding=1, bias=False,
+                                   groups=groups)
+     scratch.layer2_rn = nn.Conv2d(in_shape[1], out_shape2, kernel_size=3, stride=1, padding=1, bias=False,
+                                   groups=groups)
+     scratch.layer3_rn = nn.Conv2d(in_shape[2], out_shape3, kernel_size=3, stride=1, padding=1, bias=False,
+                                   groups=groups)
+     if len(in_shape) >= 4:
+         scratch.layer4_rn = nn.Conv2d(in_shape[3], out_shape4, kernel_size=3, stride=1, padding=1, bias=False,
+                                       groups=groups)
+
+     return scratch
+
+
+ class ResidualConvUnit(nn.Module):
+     """Residual convolution module.
+     """
+
+     def __init__(self, features, activation, bn):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super().__init__()
+
+         self.bn = bn
+
+         self.groups = 1
+
+         self.conv1 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+         self.conv2 = nn.Conv2d(features, features, kernel_size=3, stride=1, padding=1, bias=True, groups=self.groups)
+
+         if self.bn == True:
+             self.bn1 = nn.BatchNorm2d(features)
+             self.bn2 = nn.BatchNorm2d(features)
+
+         self.activation = activation
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+     def forward(self, x):
+         """Forward pass.
+
+         Args:
+             x (tensor): input
+
+         Returns:
+             tensor: output
+         """
+
+         out = self.activation(x)
+         out = self.conv1(out)
+         if self.bn == True:
+             out = self.bn1(out)
+
+         out = self.activation(out)
+         out = self.conv2(out)
+         if self.bn == True:
+             out = self.bn2(out)
+
+         if self.groups > 1:
+             out = self.conv_merge(out)
+
+         return self.skip_add.add(out, x)
+
+
+ class FeatureFusionBlock(nn.Module):
+     """Feature fusion block.
+     """
+
+     def __init__(
+         self,
+         features,
+         activation,
+         deconv=False,
+         bn=False,
+         expand=False,
+         align_corners=True,
+         size=None
+     ):
+         """Init.
+
+         Args:
+             features (int): number of features
+         """
+         super(FeatureFusionBlock, self).__init__()
+
+         self.deconv = deconv
+         self.align_corners = align_corners
+
+         self.groups = 1
+
+         self.expand = expand
+         out_features = features
+         if self.expand == True:
+             out_features = features // 2
+
+         self.out_conv = nn.Conv2d(features, out_features, kernel_size=1, stride=1, padding=0, bias=True, groups=1)
+
+         self.resConfUnit1 = ResidualConvUnit(features, activation, bn)
+         self.resConfUnit2 = ResidualConvUnit(features, activation, bn)
+
+         self.skip_add = nn.quantized.FloatFunctional()
+
+         self.size = size
+
+     def forward(self, *xs, size=None):
+         """Forward pass.
+
+         Returns:
+             tensor: output
+         """
+         output = xs[0]
+
+         if len(xs) == 2:
+             res = self.resConfUnit1(xs[1])
+             output = self.skip_add.add(output, res)
+
+         output = self.resConfUnit2(output)
+
+         if (size is None) and (self.size is None):
+             modifier = {"scale_factor": 2}
+         elif size is None:
+             modifier = {"size": self.size}
+         else:
+             modifier = {"size": size}
+
+         output = nn.functional.interpolate(output, **modifier, mode="bilinear", align_corners=self.align_corners)
+         output = self.out_conv(output)
+
+         return output
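
A minimal sketch of how these DPT-style pieces compose (import path assumed; channel counts and spatial sizes are illustrative, and the real decoder builds one `FeatureFusionBlock` per feature level). `_make_scratch` projects backbone features to a common width, and each fusion block adds an optional skip input, refines it, and upsamples by 2x:

import torch
import torch.nn as nn
from vace.annotators.depth_anything_v2.util.blocks import _make_scratch, FeatureFusionBlock  # assumed path

scratch = _make_scratch([256, 512, 1024, 1024], out_shape=256)
feat3 = scratch.layer3_rn(torch.randn(1, 1024, 24, 24))  # -> (1, 256, 24, 24)
feat4 = scratch.layer4_rn(torch.randn(1, 1024, 12, 12))  # -> (1, 256, 12, 12)

fuse = FeatureFusionBlock(256, nn.ReLU(False), bn=False, align_corners=True)
out = fuse(feat4)        # refine + 2x upsample -> (1, 256, 24, 24)
out = fuse(out, feat3)   # add skip, refine, 2x upsample -> (1, 256, 48, 48)
print(out.shape)
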
vace/annotators/depth_anything_v2/util/transform.py ADDED
@@ -0,0 +1,159 @@
+ import cv2
+ import numpy as np
+
+
+ class Resize(object):
+     """Resize sample to given size (width, height).
+     """
+
+     def __init__(
+         self,
+         width,
+         height,
+         resize_target=True,
+         keep_aspect_ratio=False,
+         ensure_multiple_of=1,
+         resize_method="lower_bound",
+         image_interpolation_method=cv2.INTER_AREA,
+     ):
+         """Init.
+
+         Args:
+             width (int): desired output width
+             height (int): desired output height
+             resize_target (bool, optional):
+                 True: Resize the full sample (image, mask, target).
+                 False: Resize image only.
+                 Defaults to True.
+             keep_aspect_ratio (bool, optional):
+                 True: Keep the aspect ratio of the input sample.
+                 Output sample might not have the given width and height, and
+                 resize behaviour depends on the parameter 'resize_method'.
+                 Defaults to False.
+             ensure_multiple_of (int, optional):
+                 Output width and height are constrained to be a multiple of this parameter.
+                 Defaults to 1.
+             resize_method (str, optional):
+                 "lower_bound": Output will be at least as large as the given size.
+                 "upper_bound": Output will be at max as large as the given size. (Output size might be smaller than given size.)
+                 "minimal": Scale as little as possible. (Output size might be smaller than given size.)
+                 Defaults to "lower_bound".
+         """
+         self.__width = width
+         self.__height = height
+
+         self.__resize_target = resize_target
+         self.__keep_aspect_ratio = keep_aspect_ratio
+         self.__multiple_of = ensure_multiple_of
+         self.__resize_method = resize_method
+         self.__image_interpolation_method = image_interpolation_method
+
+     def constrain_to_multiple_of(self, x, min_val=0, max_val=None):
+         y = (np.round(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+         if max_val is not None and y > max_val:
+             y = (np.floor(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+         if y < min_val:
+             y = (np.ceil(x / self.__multiple_of) * self.__multiple_of).astype(int)
+
+         return y
+
+     def get_size(self, width, height):
+         # determine new height and width
+         scale_height = self.__height / height
+         scale_width = self.__width / width
+
+         if self.__keep_aspect_ratio:
+             if self.__resize_method == "lower_bound":
+                 # scale such that output size is lower bound
+                 if scale_width > scale_height:
+                     # fit width
+                     scale_height = scale_width
+                 else:
+                     # fit height
+                     scale_width = scale_height
+             elif self.__resize_method == "upper_bound":
+                 # scale such that output size is upper bound
+                 if scale_width < scale_height:
+                     # fit width
+                     scale_height = scale_width
+                 else:
+                     # fit height
+                     scale_width = scale_height
+             elif self.__resize_method == "minimal":
+                 # scale as little as possible
+                 if abs(1 - scale_width) < abs(1 - scale_height):
+                     # fit width
+                     scale_height = scale_width
+                 else:
+                     # fit height
+                     scale_width = scale_height
+             else:
+                 raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+         if self.__resize_method == "lower_bound":
+             new_height = self.constrain_to_multiple_of(scale_height * height, min_val=self.__height)
+             new_width = self.constrain_to_multiple_of(scale_width * width, min_val=self.__width)
+         elif self.__resize_method == "upper_bound":
+             new_height = self.constrain_to_multiple_of(scale_height * height, max_val=self.__height)
+             new_width = self.constrain_to_multiple_of(scale_width * width, max_val=self.__width)
+         elif self.__resize_method == "minimal":
+             new_height = self.constrain_to_multiple_of(scale_height * height)
+             new_width = self.constrain_to_multiple_of(scale_width * width)
+         else:
+             raise ValueError(f"resize_method {self.__resize_method} not implemented")
+
+         return (new_width, new_height)
+
+     def __call__(self, sample):
+         width, height = self.get_size(sample["image"].shape[1], sample["image"].shape[0])
+
+         # resize sample
+         sample["image"] = cv2.resize(sample["image"], (width, height), interpolation=self.__image_interpolation_method)
+
+         if self.__resize_target:
+             if "depth" in sample:
+                 sample["depth"] = cv2.resize(sample["depth"], (width, height), interpolation=cv2.INTER_NEAREST)
+
+             if "mask" in sample:
+                 sample["mask"] = cv2.resize(sample["mask"].astype(np.float32), (width, height),
+                                             interpolation=cv2.INTER_NEAREST)
+
+         return sample
+
+
+ class NormalizeImage(object):
+     """Normalize image by given mean and std.
+     """
+
+     def __init__(self, mean, std):
+         self.__mean = mean
+         self.__std = std
+
+     def __call__(self, sample):
+         sample["image"] = (sample["image"] - self.__mean) / self.__std
+
+         return sample
+
+
+ class PrepareForNet(object):
+     """Prepare sample for usage as network input.
+     """
+
+     def __init__(self):
+         pass
+
+     def __call__(self, sample):
+         image = np.transpose(sample["image"], (2, 0, 1))
+         sample["image"] = np.ascontiguousarray(image).astype(np.float32)
+
+         if "depth" in sample:
+             depth = sample["depth"].astype(np.float32)
+             sample["depth"] = np.ascontiguousarray(depth)
+
+         if "mask" in sample:
+             sample["mask"] = sample["mask"].astype(np.float32)
+             sample["mask"] = np.ascontiguousarray(sample["mask"])
+
+         return sample
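
The three transforms are meant to be chained into a single preprocessing step. A minimal sketch, assuming `torchvision` provides `Compose` and using the 518-px / multiple-of-14 / ImageNet constants of the usual Depth Anything setup (the values this repo's depth annotator actually uses may differ):

import cv2
from torchvision.transforms import Compose
from vace.annotators.depth_anything_v2.util.transform import Resize, NormalizeImage, PrepareForNet  # assumed path

transform = Compose([
    Resize(518, 518, resize_target=False, keep_aspect_ratio=True,
           ensure_multiple_of=14, resize_method="lower_bound",
           image_interpolation_method=cv2.INTER_CUBIC),
    NormalizeImage(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    PrepareForNet(),
])

raw = cv2.imread("assets/images/test.jpg")            # BGR uint8
image = cv2.cvtColor(raw, cv2.COLOR_BGR2RGB) / 255.0  # float RGB in [0, 1]
sample = transform({"image": image})
print(sample["image"].shape)                          # (3, H', W'), both multiples of 14
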
vace/annotators/dwpose/__init__.py ADDED
@@ -0,0 +1,2 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
vace/annotators/dwpose/onnxdet.py ADDED
@@ -0,0 +1,127 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ import cv2
+ import numpy as np
+
+ import onnxruntime
+
+ def nms(boxes, scores, nms_thr):
+     """Single class NMS implemented in Numpy."""
+     x1 = boxes[:, 0]
+     y1 = boxes[:, 1]
+     x2 = boxes[:, 2]
+     y2 = boxes[:, 3]
+
+     areas = (x2 - x1 + 1) * (y2 - y1 + 1)
+     order = scores.argsort()[::-1]
+
+     keep = []
+     while order.size > 0:
+         i = order[0]
+         keep.append(i)
+         xx1 = np.maximum(x1[i], x1[order[1:]])
+         yy1 = np.maximum(y1[i], y1[order[1:]])
+         xx2 = np.minimum(x2[i], x2[order[1:]])
+         yy2 = np.minimum(y2[i], y2[order[1:]])
+
+         w = np.maximum(0.0, xx2 - xx1 + 1)
+         h = np.maximum(0.0, yy2 - yy1 + 1)
+         inter = w * h
+         ovr = inter / (areas[i] + areas[order[1:]] - inter)
+
+         inds = np.where(ovr <= nms_thr)[0]
+         order = order[inds + 1]
+
+     return keep
+
+ def multiclass_nms(boxes, scores, nms_thr, score_thr):
+     """Multiclass NMS implemented in Numpy. Class-aware version."""
+     final_dets = []
+     num_classes = scores.shape[1]
+     for cls_ind in range(num_classes):
+         cls_scores = scores[:, cls_ind]
+         valid_score_mask = cls_scores > score_thr
+         if valid_score_mask.sum() == 0:
+             continue
+         else:
+             valid_scores = cls_scores[valid_score_mask]
+             valid_boxes = boxes[valid_score_mask]
+             keep = nms(valid_boxes, valid_scores, nms_thr)
+             if len(keep) > 0:
+                 cls_inds = np.ones((len(keep), 1)) * cls_ind
+                 dets = np.concatenate(
+                     [valid_boxes[keep], valid_scores[keep, None], cls_inds], 1
+                 )
+                 final_dets.append(dets)
+     if len(final_dets) == 0:
+         return None
+     return np.concatenate(final_dets, 0)
+
+ def demo_postprocess(outputs, img_size, p6=False):
+     grids = []
+     expanded_strides = []
+     strides = [8, 16, 32] if not p6 else [8, 16, 32, 64]
+
+     hsizes = [img_size[0] // stride for stride in strides]
+     wsizes = [img_size[1] // stride for stride in strides]
+
+     for hsize, wsize, stride in zip(hsizes, wsizes, strides):
+         xv, yv = np.meshgrid(np.arange(wsize), np.arange(hsize))
+         grid = np.stack((xv, yv), 2).reshape(1, -1, 2)
+         grids.append(grid)
+         shape = grid.shape[:2]
+         expanded_strides.append(np.full((*shape, 1), stride))
+
+     grids = np.concatenate(grids, 1)
+     expanded_strides = np.concatenate(expanded_strides, 1)
+     outputs[..., :2] = (outputs[..., :2] + grids) * expanded_strides
+     outputs[..., 2:4] = np.exp(outputs[..., 2:4]) * expanded_strides
+
+     return outputs
+
+ def preprocess(img, input_size, swap=(2, 0, 1)):
+     if len(img.shape) == 3:
+         padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114
+     else:
+         padded_img = np.ones(input_size, dtype=np.uint8) * 114
+
+     r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1])
+     resized_img = cv2.resize(
+         img,
+         (int(img.shape[1] * r), int(img.shape[0] * r)),
+         interpolation=cv2.INTER_LINEAR,
+     ).astype(np.uint8)
+     padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img
+
+     padded_img = padded_img.transpose(swap)
+     padded_img = np.ascontiguousarray(padded_img, dtype=np.float32)
+     return padded_img, r
+
+ def inference_detector(session, oriImg):
+     input_shape = (640, 640)
+     img, ratio = preprocess(oriImg, input_shape)
+
+     ort_inputs = {session.get_inputs()[0].name: img[None, :, :, :]}
+     output = session.run(None, ort_inputs)
+     predictions = demo_postprocess(output[0], input_shape)[0]
+
+     boxes = predictions[:, :4]
+     scores = predictions[:, 4:5] * predictions[:, 5:]
+
+     boxes_xyxy = np.ones_like(boxes)
+     boxes_xyxy[:, 0] = boxes[:, 0] - boxes[:, 2]/2.
+     boxes_xyxy[:, 1] = boxes[:, 1] - boxes[:, 3]/2.
+     boxes_xyxy[:, 2] = boxes[:, 0] + boxes[:, 2]/2.
+     boxes_xyxy[:, 3] = boxes[:, 1] + boxes[:, 3]/2.
+     boxes_xyxy /= ratio
+     dets = multiclass_nms(boxes_xyxy, scores, nms_thr=0.45, score_thr=0.1)
+     if dets is not None:
+         final_boxes, final_scores, final_cls_inds = dets[:, :4], dets[:, 4], dets[:, 5]
+         isscore = final_scores > 0.3
+         iscat = final_cls_inds == 0
+         isbbox = [i and j for (i, j) in zip(isscore, iscat)]
+         final_boxes = final_boxes[isbbox]
+     else:
+         final_boxes = np.array([])
+
+     return final_boxes
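
A minimal sketch of calling the detector, assuming an ONNXRuntime session for a YOLOX-style person-detection model (the `.onnx` path is hypothetical):

import cv2
import onnxruntime as ort
from vace.annotators.dwpose.onnxdet import inference_detector

session = ort.InferenceSession("models/yolox_l.onnx",              # hypothetical weights path
                               providers=["CPUExecutionProvider"])
img = cv2.imread("assets/images/test.jpg")   # HxWx3 uint8
boxes = inference_detector(session, img)     # (N, 4) person boxes as x1, y1, x2, y2 (empty array if none)
print(boxes.shape)
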
vace/annotators/dwpose/onnxpose.py ADDED
@@ -0,0 +1,362 @@
+ # -*- coding: utf-8 -*-
+ # Copyright (c) Alibaba, Inc. and its affiliates.
+ from typing import List, Tuple
+
+ import cv2
+ import numpy as np
+ import onnxruntime as ort
+
+ def preprocess(
+     img: np.ndarray, out_bbox, input_size: Tuple[int, int] = (192, 256)
+ ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
+     """Do preprocessing for RTMPose model inference.
+
+     Args:
+         img (np.ndarray): Input image in shape.
+         input_size (tuple): Input image size in shape (w, h).
+
+     Returns:
+         tuple:
+         - resized_img (np.ndarray): Preprocessed image.
+         - center (np.ndarray): Center of image.
+         - scale (np.ndarray): Scale of image.
+     """
+     # get shape of image
+     img_shape = img.shape[:2]
+     out_img, out_center, out_scale = [], [], []
+     if len(out_bbox) == 0:
+         out_bbox = [[0, 0, img_shape[1], img_shape[0]]]
+     for i in range(len(out_bbox)):
+         x0 = out_bbox[i][0]
+         y0 = out_bbox[i][1]
+         x1 = out_bbox[i][2]
+         y1 = out_bbox[i][3]
+         bbox = np.array([x0, y0, x1, y1])
+
+         # get center and scale
+         center, scale = bbox_xyxy2cs(bbox, padding=1.25)
+
+         # do affine transformation
+         resized_img, scale = top_down_affine(input_size, scale, center, img)
+
+         # normalize image
+         mean = np.array([123.675, 116.28, 103.53])
+         std = np.array([58.395, 57.12, 57.375])
+         resized_img = (resized_img - mean) / std
+
+         out_img.append(resized_img)
+         out_center.append(center)
+         out_scale.append(scale)
+
+     return out_img, out_center, out_scale
+
+
+ def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray:
+     """Inference RTMPose model.
+
+     Args:
+         sess (ort.InferenceSession): ONNXRuntime session.
+         img (np.ndarray): Input image in shape.
+
+     Returns:
+         outputs (np.ndarray): Output of RTMPose model.
+     """
+     all_out = []
+     # build input
+     for i in range(len(img)):
+         input = [img[i].transpose(2, 0, 1)]
+
+         # build output
+         sess_input = {sess.get_inputs()[0].name: input}
+         sess_output = []
+         for out in sess.get_outputs():
+             sess_output.append(out.name)
+
+         # run model
+         outputs = sess.run(sess_output, sess_input)
+         all_out.append(outputs)
+
+     return all_out
+
+
+ def postprocess(outputs: List[np.ndarray],
+                 model_input_size: Tuple[int, int],
+                 center: Tuple[int, int],
+                 scale: Tuple[int, int],
+                 simcc_split_ratio: float = 2.0
+                 ) -> Tuple[np.ndarray, np.ndarray]:
+     """Postprocess for RTMPose model output.
+
+     Args:
+         outputs (np.ndarray): Output of RTMPose model.
+         model_input_size (tuple): RTMPose model input image size.
+         center (tuple): Center of bbox in shape (x, y).
+         scale (tuple): Scale of bbox in shape (w, h).
+         simcc_split_ratio (float): Split ratio of simcc.
+
+     Returns:
+         tuple:
+         - keypoints (np.ndarray): Rescaled keypoints.
+         - scores (np.ndarray): Model predict scores.
+     """
+     all_key = []
+     all_score = []
+     for i in range(len(outputs)):
+         # use simcc to decode
+         simcc_x, simcc_y = outputs[i]
+         keypoints, scores = decode(simcc_x, simcc_y, simcc_split_ratio)
+
+         # rescale keypoints
+         keypoints = keypoints / model_input_size * scale[i] + center[i] - scale[i] / 2
+         all_key.append(keypoints[0])
+         all_score.append(scores[0])
+
+     return np.array(all_key), np.array(all_score)
+
+
+ def bbox_xyxy2cs(bbox: np.ndarray,
+                  padding: float = 1.) -> Tuple[np.ndarray, np.ndarray]:
+     """Transform the bbox format from (x1, y1, x2, y2) into (center, scale)
+
+     Args:
+         bbox (ndarray): Bounding box(es) in shape (4,) or (n, 4), formatted
+             as (left, top, right, bottom)
+         padding (float): BBox padding factor that will be multiplied to scale.
+             Default: 1.0
+
+     Returns:
+         tuple: A tuple containing center and scale.
+         - np.ndarray[float32]: Center (x, y) of the bbox in shape (2,) or
+             (n, 2)
+         - np.ndarray[float32]: Scale (w, h) of the bbox in shape (2,) or
+             (n, 2)
+     """
+     # convert single bbox from (4, ) to (1, 4)
+     dim = bbox.ndim
+     if dim == 1:
+         bbox = bbox[None, :]
+
+     # get bbox center and scale
+     x1, y1, x2, y2 = np.hsplit(bbox, [1, 2, 3])
+     center = np.hstack([x1 + x2, y1 + y2]) * 0.5
+     scale = np.hstack([x2 - x1, y2 - y1]) * padding
+
+     if dim == 1:
+         center = center[0]
+         scale = scale[0]
+
+     return center, scale
+
+
+ def _fix_aspect_ratio(bbox_scale: np.ndarray,
+                       aspect_ratio: float) -> np.ndarray:
+     """Extend the scale to match the given aspect ratio.
+
+     Args:
+         scale (np.ndarray): The image scale (w, h) in shape (2, )
+         aspect_ratio (float): The ratio of ``w/h``
+
+     Returns:
+         np.ndarray: The reshaped image scale in (2, )
+     """
+     w, h = np.hsplit(bbox_scale, [1])
+     bbox_scale = np.where(w > h * aspect_ratio,
+                           np.hstack([w, w / aspect_ratio]),
+                           np.hstack([h * aspect_ratio, h]))
+     return bbox_scale
+
+
+ def _rotate_point(pt: np.ndarray, angle_rad: float) -> np.ndarray:
+     """Rotate a point by an angle.
+
+     Args:
+         pt (np.ndarray): 2D point coordinates (x, y) in shape (2, )
+         angle_rad (float): rotation angle in radian
+
+     Returns:
+         np.ndarray: Rotated point in shape (2, )
+     """
+     sn, cs = np.sin(angle_rad), np.cos(angle_rad)
+     rot_mat = np.array([[cs, -sn], [sn, cs]])
+     return rot_mat @ pt
+
+
+ def _get_3rd_point(a: np.ndarray, b: np.ndarray) -> np.ndarray:
+     """To calculate the affine matrix, three pairs of points are required. This
+     function is used to get the 3rd point, given 2D points a & b.
+
+     The 3rd point is defined by rotating vector `a - b` by 90 degrees
+     anticlockwise, using b as the rotation center.
+
+     Args:
+         a (np.ndarray): The 1st point (x,y) in shape (2, )
+         b (np.ndarray): The 2nd point (x,y) in shape (2, )
+
+     Returns:
+         np.ndarray: The 3rd point.
+     """
+     direction = a - b
+     c = b + np.r_[-direction[1], direction[0]]
+     return c
+
+
+ def get_warp_matrix(center: np.ndarray,
+                     scale: np.ndarray,
+                     rot: float,
+                     output_size: Tuple[int, int],
+                     shift: Tuple[float, float] = (0., 0.),
+                     inv: bool = False) -> np.ndarray:
+     """Calculate the affine transformation matrix that can warp the bbox area
+     in the input image to the output size.
+
+     Args:
+         center (np.ndarray[2, ]): Center of the bounding box (x, y).
+         scale (np.ndarray[2, ]): Scale of the bounding box
+             wrt [width, height].
+         rot (float): Rotation angle (degree).
+         output_size (np.ndarray[2, ] | list(2,)): Size of the
+             destination heatmaps.
+         shift (0-100%): Shift translation ratio wrt the width/height.
+             Default (0., 0.).
+         inv (bool): Option to inverse the affine transform direction.
+             (inv=False: src->dst or inv=True: dst->src)
+
+     Returns:
+         np.ndarray: A 2x3 transformation matrix
+     """
+     shift = np.array(shift)
+     src_w = scale[0]
+     dst_w = output_size[0]
+     dst_h = output_size[1]
+
+     # compute transformation matrix
+     rot_rad = np.deg2rad(rot)
+     src_dir = _rotate_point(np.array([0., src_w * -0.5]), rot_rad)
+     dst_dir = np.array([0., dst_w * -0.5])
+
+     # get four corners of the src rectangle in the original image
+     src = np.zeros((3, 2), dtype=np.float32)
+     src[0, :] = center + scale * shift
+     src[1, :] = center + src_dir + scale * shift
+     src[2, :] = _get_3rd_point(src[0, :], src[1, :])
+
+     # get four corners of the dst rectangle in the input image
+     dst = np.zeros((3, 2), dtype=np.float32)
+     dst[0, :] = [dst_w * 0.5, dst_h * 0.5]
+     dst[1, :] = np.array([dst_w * 0.5, dst_h * 0.5]) + dst_dir
+     dst[2, :] = _get_3rd_point(dst[0, :], dst[1, :])
+
+     if inv:
+         warp_mat = cv2.getAffineTransform(np.float32(dst), np.float32(src))
+     else:
+         warp_mat = cv2.getAffineTransform(np.float32(src), np.float32(dst))
+
+     return warp_mat
+
+
+ def top_down_affine(input_size: dict, bbox_scale: dict, bbox_center: dict,
+                     img: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+     """Get the bbox image as the model input by affine transform.
+
+     Args:
+         input_size (dict): The input size of the model.
+         bbox_scale (dict): The bbox scale of the img.
+         bbox_center (dict): The bbox center of the img.
+         img (np.ndarray): The original image.
+
+     Returns:
+         tuple: A tuple containing center and scale.
+         - np.ndarray[float32]: img after affine transform.
+         - np.ndarray[float32]: bbox scale after affine transform.
+     """
+     w, h = input_size
+     warp_size = (int(w), int(h))
+
+     # reshape bbox to fixed aspect ratio
+     bbox_scale = _fix_aspect_ratio(bbox_scale, aspect_ratio=w / h)
+
+     # get the affine matrix
+     center = bbox_center
+     scale = bbox_scale
+     rot = 0
+     warp_mat = get_warp_matrix(center, scale, rot, output_size=(w, h))
+
+     # do affine transform
+     img = cv2.warpAffine(img, warp_mat, warp_size, flags=cv2.INTER_LINEAR)
+
+     return img, bbox_scale
+
+
+ def get_simcc_maximum(simcc_x: np.ndarray,
+                       simcc_y: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+     """Get maximum response location and value from simcc representations.
+
+     Note:
+         instance number: N
+         num_keypoints: K
+         heatmap height: H
+         heatmap width: W
+
+     Args:
+         simcc_x (np.ndarray): x-axis SimCC in shape (K, Wx) or (N, K, Wx)
+         simcc_y (np.ndarray): y-axis SimCC in shape (K, Wy) or (N, K, Wy)
+
+     Returns:
+         tuple:
+         - locs (np.ndarray): locations of maximum heatmap responses in shape
+             (K, 2) or (N, K, 2)
+         - vals (np.ndarray): values of maximum heatmap responses in shape
+             (K,) or (N, K)
+     """
+     N, K, Wx = simcc_x.shape
+     simcc_x = simcc_x.reshape(N * K, -1)
+     simcc_y = simcc_y.reshape(N * K, -1)
+
+     # get maximum value locations
+     x_locs = np.argmax(simcc_x, axis=1)
+     y_locs = np.argmax(simcc_y, axis=1)
+     locs = np.stack((x_locs, y_locs), axis=-1).astype(np.float32)
+     max_val_x = np.amax(simcc_x, axis=1)
+     max_val_y = np.amax(simcc_y, axis=1)
+
+     # get maximum value across x and y axis
+     mask = max_val_x > max_val_y
+     max_val_x[mask] = max_val_y[mask]
+     vals = max_val_x
+     locs[vals <= 0.] = -1
+
+     # reshape
+     locs = locs.reshape(N, K, 2)
+     vals = vals.reshape(N, K)
+
+     return locs, vals
+
+
+ def decode(simcc_x: np.ndarray, simcc_y: np.ndarray,
+            simcc_split_ratio) -> Tuple[np.ndarray, np.ndarray]:
+     """Modulate simcc distribution with Gaussian.
+
+     Args:
+         simcc_x (np.ndarray[K, Wx]): model predicted simcc in x.
+         simcc_y (np.ndarray[K, Wy]): model predicted simcc in y.
+         simcc_split_ratio (int): The split ratio of simcc.
+
+     Returns:
+         tuple: A tuple containing center and scale.
+         - np.ndarray[float32]: keypoints in shape (K, 2) or (n, K, 2)
+         - np.ndarray[float32]: scores in shape (K,) or (n, K)
+     """
+     keypoints, scores = get_simcc_maximum(simcc_x, simcc_y)
+     keypoints /= simcc_split_ratio
+
+     return keypoints, scores
+
+
+ def inference_pose(session, out_bbox, oriImg):
+     h, w = session.get_inputs()[0].shape[2:]
+     model_input_size = (w, h)
+     resized_img, center, scale = preprocess(oriImg, out_bbox, model_input_size)
+     outputs = inference(session, resized_img)
+     keypoints, scores = postprocess(outputs, model_input_size, center, scale)
+
+     return keypoints, scores
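
A minimal sketch of the two-stage DWPose flow implied by these two modules: detect people first, then estimate keypoints per box (both `.onnx` paths are hypothetical, and `inference` relies on ONNXRuntime converting the Python-list input to the model's expected float type):

import cv2
import onnxruntime as ort
from vace.annotators.dwpose.onnxdet import inference_detector
from vace.annotators.dwpose.onnxpose import inference_pose

det_sess = ort.InferenceSession("models/yolox_l.onnx", providers=["CPUExecutionProvider"])            # hypothetical
pose_sess = ort.InferenceSession("models/dw-ll_ucoco_384.onnx", providers=["CPUExecutionProvider"])   # hypothetical

img = cv2.imread("assets/images/test.jpg")
boxes = inference_detector(det_sess, img)                  # (N, 4) person boxes
keypoints, scores = inference_pose(pose_sess, boxes, img)  # (N, K, 2) keypoints and (N, K) scores
print(keypoints.shape, scores.shape)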